mirror of
https://github.com/iconify/iconify.git
synced 2025-01-07 15:44:05 +00:00
fix(utils): bugged emoji regex
This commit is contained in:
parent
92a2624d98
commit
e443b2ae12
@ -3,7 +3,7 @@
|
||||
"type": "module",
|
||||
"description": "Common functions for working with Iconify icon sets used by various packages.",
|
||||
"author": "Vjacheslav Trushkin",
|
||||
"version": "2.0.6",
|
||||
"version": "2.0.7",
|
||||
"license": "MIT",
|
||||
"bugs": "https://github.com/iconify/iconify/issues",
|
||||
"homepage": "https://iconify.design/",
|
||||
|
@ -222,13 +222,12 @@ export function mergeSimilarRegexItemSequences(
|
||||
length = 1;
|
||||
}
|
||||
} else {
|
||||
length = slice;
|
||||
|
||||
if (item.type !== 'sequence') {
|
||||
throw new Error(
|
||||
`Unexpected partial match for type "${item.type}"`
|
||||
);
|
||||
}
|
||||
length = type === 'start' ? slice : item.items.length - slice;
|
||||
|
||||
// Copy remaining chunks
|
||||
differentSequences.push(
|
||||
@ -267,7 +266,9 @@ export function mergeSimilarRegexItemSequences(
|
||||
sequence =
|
||||
type === 'start'
|
||||
? commonItem.items.slice(0, longestMatch)
|
||||
: commonItem.items.slice(longestMatch);
|
||||
: commonItem.items.slice(
|
||||
commonItem.items.length - longestMatch
|
||||
);
|
||||
}
|
||||
|
||||
// Merge other chunks
|
||||
|
@ -156,7 +156,9 @@ export function parseEmojiTree(items: TreeItem[]): EmojiItemRegex {
|
||||
if (parsedItems.length === 1) {
|
||||
return parsedItems[0];
|
||||
}
|
||||
return mergeSimilarItemsInSet(createSetEmojiRegexItem(parsedItems));
|
||||
const set = createSetEmojiRegexItem(parsedItems);
|
||||
const result = mergeSimilarItemsInSet(set);
|
||||
return result;
|
||||
}
|
||||
|
||||
function parseItemChildren(item: TreeItem): ParsedTreeItem {
|
||||
|
@ -1,10 +1,58 @@
|
||||
import { createOptimisedRegex } from '../lib/emoji/regex/create';
|
||||
import { readFile, writeFile, unlink } from 'node:fs/promises';
|
||||
import { parseEmojiTestFile } from '../lib/emoji/test/parse';
|
||||
import { emojiVersion } from '../lib/emoji/data';
|
||||
import {
|
||||
createOptimisedRegex,
|
||||
createOptimisedRegexForEmojiSequences,
|
||||
} from '../lib/emoji/regex/create';
|
||||
import {
|
||||
getEmojiMatchesInText,
|
||||
sortEmojiMatchesInText,
|
||||
} from '../lib/emoji/replace/find';
|
||||
import { getQualifiedEmojiVariations } from '../lib/emoji/test/variations';
|
||||
import { getEmojiSequenceString } from '../lib/emoji/format';
|
||||
|
||||
describe('Finding emojis in text', () => {
|
||||
/**
 * Load contents of Unicode `emoji-test.txt` for the current `emojiVersion`.
 *
 * The file is read from the local fixtures cache if it exists there;
 * otherwise it is downloaded from unicode.org and written to the cache.
 *
 * @returns File contents, or `undefined` if the download failed or the
 * contents do not include the expected `# Version: ${emojiVersion}` marker
 * (in that case the cached copy is removed so the next run retries).
 */
async function fetchEmojiTestData(): Promise<string | undefined> {
	// Fetch emojis, cache it
	const source = `tests/fixtures/download-emoji-${emojiVersion}.txt`;

	let data: string | undefined;
	try {
		data = await readFile(source, 'utf8');
	} catch {
		// Cache miss: fall through to download
	}

	if (!data) {
		try {
			const response = await fetch(
				`https://unicode.org/Public/emoji/${emojiVersion}/emoji-test.txt`
			);
			// Response.text() returns a promise, so it must be awaited.
			// Calling toString() on it yields "[object Promise]" instead
			// of the file contents.
			data = await response.text();
		} catch {
			// Download failed: report data as unavailable, so the
			// dependent test is skipped instead of crashing the setup
			return;
		}
		await writeFile(source, data, 'utf8');
	}

	// Test content, unlink cache on failure
	if (data.indexOf(`# Version: ${emojiVersion}`) === -1) {
		try {
			await unlink(source);
		} catch {
			// Nothing to clean up
		}
		return;
	}
	return data;
}
|
||||
|
||||
let data: string | undefined;
|
||||
|
||||
beforeAll(async () => {
|
||||
data = await fetchEmojiTestData();
|
||||
});
|
||||
|
||||
it('Simple regex', () => {
|
||||
const regexValue = createOptimisedRegex([
|
||||
'1F600',
|
||||
@ -325,4 +373,87 @@ describe('Finding emojis in text', () => {
|
||||
},
|
||||
]);
|
||||
});
|
||||
|
||||
it('Finding all test emojis', () => {
|
||||
if (!data) {
|
||||
console.warn('Test skipped: test data is not available');
|
||||
return;
|
||||
}
|
||||
|
||||
// Parse test data
|
||||
const testData = parseEmojiTestFile(data);
|
||||
const sequences = Object.values(testData).map(({ sequence }) => {
|
||||
return {
|
||||
sequence,
|
||||
};
|
||||
});
|
||||
|
||||
// Get all icons
|
||||
const iconsList = getQualifiedEmojiVariations(sequences, testData);
|
||||
|
||||
// Get regex
|
||||
const regexValue = createOptimisedRegexForEmojiSequences(
|
||||
iconsList.map((item) => item.sequence)
|
||||
);
|
||||
const regex = new RegExp(regexValue, 'g');
|
||||
|
||||
sequences.forEach((sequence) => {
|
||||
const text = sequence.sequence
|
||||
.map((code) => String.fromCodePoint(code))
|
||||
.join('');
|
||||
|
||||
// Test finding match
|
||||
const result = getEmojiMatchesInText(regex, text);
|
||||
|
||||
// Must have only 1 item
|
||||
if (result.length !== 1) {
|
||||
console.log(
|
||||
getEmojiSequenceString(sequence.sequence),
|
||||
`(\\u${getEmojiSequenceString(sequence.sequence, {
|
||||
format: 'utf-16',
|
||||
separator: '\\u',
|
||||
case: 'upper',
|
||||
})})`,
|
||||
text
|
||||
);
|
||||
result.forEach((match) => {
|
||||
const sequence: number[] = [];
|
||||
for (const codePoint of match.match) {
|
||||
const num = codePoint.codePointAt(0) as number;
|
||||
sequence.push(num);
|
||||
}
|
||||
console.log(
|
||||
getEmojiSequenceString(sequence),
|
||||
`(\\u${getEmojiSequenceString(sequence, {
|
||||
format: 'utf-16',
|
||||
separator: '\\u',
|
||||
case: 'upper',
|
||||
})})`
|
||||
);
|
||||
});
|
||||
console.log(result);
|
||||
expect(result.length).toBe(1);
|
||||
}
|
||||
|
||||
const firstMatch = result[0];
|
||||
const resultSequence = [];
|
||||
for (const codePoint of firstMatch.match) {
|
||||
const num = codePoint.codePointAt(0) as number;
|
||||
resultSequence.push(num);
|
||||
}
|
||||
|
||||
if (resultSequence.length !== sequence.sequence.length) {
|
||||
console.log(
|
||||
getEmojiSequenceString(sequence.sequence),
|
||||
`(\\u${getEmojiSequenceString(sequence.sequence, {
|
||||
format: 'utf-16',
|
||||
separator: '\\u',
|
||||
case: 'upper',
|
||||
})})`,
|
||||
result
|
||||
);
|
||||
}
|
||||
expect(resultSequence).toEqual(sequence.sequence);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
@ -446,4 +446,44 @@ describe('Similar chunks of regex', () => {
|
||||
group: false,
|
||||
});
|
||||
});
|
||||
|
||||
it('Same end match', () => {
|
||||
const items = [
|
||||
createRegexForNumbersSequence([128139, 8205, 129489, 127996]),
|
||||
createRegexForNumbersSequence([129489, 127996]),
|
||||
];
|
||||
|
||||
const merge = findSimilarRegexItemSequences(items);
|
||||
expect(merge).toEqual({
|
||||
score: 24,
|
||||
sequences: [
|
||||
{
|
||||
type: 'end',
|
||||
slices: [
|
||||
{
|
||||
index: 0,
|
||||
slice: 3,
|
||||
},
|
||||
{
|
||||
index: 1,
|
||||
slice: 'full',
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const sequence = merge?.sequences[0];
|
||||
if (!sequence) {
|
||||
throw new Error('Unexpected undefined sequence');
|
||||
}
|
||||
|
||||
// Merge items
|
||||
const merged = mergeSimilarRegexItemSequences(items, sequence);
|
||||
|
||||
expect(merged.length).toBe(1);
|
||||
expect(merged[0].regex).toBe(
|
||||
'(?:\\uD83D\\uDC8B\\u200D)?\\uD83E\\uDDD1\\uD83C\\uDFFC'
|
||||
);
|
||||
});
|
||||
});
|
||||
|
@ -1,6 +1,18 @@
|
||||
import { createOptimisedRegex } from '../lib/emoji/regex/create';
|
||||
import {
|
||||
getEmojiSequenceFromString,
|
||||
getSequenceFromEmojiStringOrKeyword,
|
||||
} from '../lib/emoji/cleanup';
|
||||
import {
|
||||
createOptimisedRegex,
|
||||
createOptimisedRegexForEmojiSequences,
|
||||
} from '../lib/emoji/regex/create';
|
||||
|
||||
describe('Emoji regex matching', () => {
|
||||
/**
 * Build a string from the code points that
 * getSequenceFromEmojiStringOrKeyword() returns for the given value
 */
function code(value: string): string {
	const codePoints = getSequenceFromEmojiStringOrKeyword(value);
	let text = '';
	for (const codePoint of codePoints) {
		text += String.fromCodePoint(codePoint);
	}
	return text;
}
|
||||
|
||||
it('Simple regex', () => {
|
||||
const regexValue = createOptimisedRegex(['1F600', '1F603', '1F604']);
|
||||
|
||||
@ -50,29 +62,17 @@ Tabby cat: :tabby_cat:
|
||||
|
||||
const matches = `
|
||||
E0.6 dashing away: ${String.fromCodePoint(0x1f4a8)}
|
||||
E13.1 face exhaling: ${
|
||||
String.fromCodePoint(0x1f62e) +
|
||||
String.fromCodePoint(0x200d) +
|
||||
String.fromCodePoint(0x1f4a8)
|
||||
}
|
||||
E13.1 face exhaling: ${code('1f62e-200d-1f4a8')}
|
||||
E1.0 face with open mouth: ${String.fromCodePoint(0x1f62e)}
|
||||
E0.6 smiling face: ${
|
||||
String.fromCodePoint(0x263a) + String.fromCodePoint(0xfe0f)
|
||||
} (icon)
|
||||
E0.6 smiling face: ${code('263a-fe0f')} (icon)
|
||||
E0.6 smiling face: ${String.fromCodePoint(0x263a)} (text)
|
||||
`.match(new RegExp(regexValue, 'g'));
|
||||
|
||||
expect(matches?.length).toBe(5);
|
||||
expect(matches?.[0]).toBe(String.fromCodePoint(0x1f4a8));
|
||||
expect(matches?.[1]).toBe(
|
||||
String.fromCodePoint(0x1f62e) +
|
||||
String.fromCodePoint(0x200d) +
|
||||
String.fromCodePoint(0x1f4a8)
|
||||
);
|
||||
expect(matches?.[0]).toBe(code('1f4a8'));
|
||||
expect(matches?.[1]).toBe(code('1f62e 200d 1f4a8'));
|
||||
expect(matches?.[2]).toBe(String.fromCodePoint(0x1f62e));
|
||||
expect(matches?.[3]).toBe(
|
||||
String.fromCodePoint(0x263a) + String.fromCodePoint(0xfe0f)
|
||||
);
|
||||
expect(matches?.[3]).toBe(code('263a fe0f'));
|
||||
expect(matches?.[4]).toBe(String.fromCodePoint(0x263a));
|
||||
});
|
||||
|
||||
@ -116,4 +116,24 @@ E1.0 waving hand: medium skin tone: ${
|
||||
String.fromCodePoint(0x1f44b) + String.fromCodePoint(0x1f3fd)
|
||||
);
|
||||
});
|
||||
|
||||
it('Bugged mix of sequences', () => {
|
||||
const fullList = [
|
||||
'1f9d1-1f3fb-200d-2764-fe0f-200d-1f48b-200d-1f9d1-1f3fc',
|
||||
'1f9d1-1f3fb-200d-2764-fe0f-200d-1f9d1-1f3fc',
|
||||
];
|
||||
|
||||
const regexValue = createOptimisedRegexForEmojiSequences(
|
||||
fullList.map((code) => getEmojiSequenceFromString(code))
|
||||
);
|
||||
|
||||
const matches = code(
|
||||
'1f9d1-1f3fb-200d-2764-fe0f-200d-1f48b-200d-1f9d1-1f3fc'
|
||||
).match(new RegExp(regexValue, 'g'));
|
||||
|
||||
expect(matches?.length).toBe(1);
|
||||
expect(matches?.[0]).toBe(
|
||||
code('1f9d1-1f3fb-200d-2764-fe0f-200d-1f48b-200d-1f9d1-1f3fc')
|
||||
);
|
||||
});
|
||||
});
|
||||
|
@ -221,4 +221,57 @@ describe('Emoji regex tree', () => {
|
||||
// '\\uD83E\\uDEF1\\uD83C(?:\\uDFFB|\\uDFFC)(?:\\u200D\\uD83E\\uDEF2\\uD83C[\\uDFFC-\\uDFFF])?'
|
||||
// );
|
||||
});
|
||||
|
||||
it('Multiple children with same last child', () => {
|
||||
const numbers = [
|
||||
getEmojiSequenceFromString(
|
||||
'1f9d1-1f3fb-200d-2764-fe0f-200d-1f48b-200d-1f9d1-1f3fc'
|
||||
),
|
||||
getEmojiSequenceFromString(
|
||||
'1f9d1-1f3fb-200d-2764-fe0f-200d-1f9d1-1f3fc'
|
||||
),
|
||||
];
|
||||
const tree = createEmojisTree(numbers);
|
||||
expect(tree).toEqual([
|
||||
{
|
||||
regex: createRegexForNumbersSequence([0x1f9d1, 0x1f3fb]),
|
||||
children: [
|
||||
{
|
||||
regex: createRegexForNumbersSequence([0x2764, 0xfe0f]),
|
||||
children: [
|
||||
{
|
||||
regex: createRegexForNumbersSequence([0x1f48b]),
|
||||
children: [
|
||||
{
|
||||
regex: createRegexForNumbersSequence([
|
||||
0x1f9d1, 0x1f3fc,
|
||||
]),
|
||||
end: true,
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
regex: createRegexForNumbersSequence([
|
||||
0x1f9d1, 0x1f3fc,
|
||||
]),
|
||||
end: true,
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
]);
|
||||
|
||||
// 'D83E-DDD1-D83C-DFFB-200D-2764-FE0F-200D-D83D-DC8B-200D-D83E-DDD1-D83C-DFFC' +
|
||||
// 'D83E-DDD1-D83C-DFFB-200D-2764-FE0F-200D-D83E-DDD1-D83C-DFFC' =
|
||||
// 'D83E-DDD1-D83C-DFFB-200D-2764-FE0F-200D' + 'D83D-DC8B-200D'? + 'D83E-DDD1-D83C-DFFC'
|
||||
expect(parseEmojiTree(tree).regex).toEqual(
|
||||
// First common chunk
|
||||
'\\uD83E\\uDDD1\\uD83C\\uDFFB\\u200D\\u2764\\uFE0F?\\u200D' +
|
||||
// Optional chunk
|
||||
'(?:\\uD83D\\uDC8B\\u200D)?' +
|
||||
// Last common chunk
|
||||
'\\uD83E\\uDDD1\\uD83C\\uDFFC'
|
||||
);
|
||||
});
|
||||
});
|
||||
|
Loading…
Reference in New Issue
Block a user