mirror of
https://github.com/iconify/iconify.git
synced 2025-01-23 07:08:34 +00:00
fix(utils): bugged emoji regex
This commit is contained in:
parent
92a2624d98
commit
e443b2ae12
@ -3,7 +3,7 @@
|
|||||||
"type": "module",
|
"type": "module",
|
||||||
"description": "Common functions for working with Iconify icon sets used by various packages.",
|
"description": "Common functions for working with Iconify icon sets used by various packages.",
|
||||||
"author": "Vjacheslav Trushkin",
|
"author": "Vjacheslav Trushkin",
|
||||||
"version": "2.0.6",
|
"version": "2.0.7",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"bugs": "https://github.com/iconify/iconify/issues",
|
"bugs": "https://github.com/iconify/iconify/issues",
|
||||||
"homepage": "https://iconify.design/",
|
"homepage": "https://iconify.design/",
|
||||||
|
@ -222,13 +222,12 @@ export function mergeSimilarRegexItemSequences(
|
|||||||
length = 1;
|
length = 1;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
length = slice;
|
|
||||||
|
|
||||||
if (item.type !== 'sequence') {
|
if (item.type !== 'sequence') {
|
||||||
throw new Error(
|
throw new Error(
|
||||||
`Unexpected partial match for type "${item.type}"`
|
`Unexpected partial match for type "${item.type}"`
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
length = type === 'start' ? slice : item.items.length - slice;
|
||||||
|
|
||||||
// Copy remaining chunks
|
// Copy remaining chunks
|
||||||
differentSequences.push(
|
differentSequences.push(
|
||||||
@ -267,7 +266,9 @@ export function mergeSimilarRegexItemSequences(
|
|||||||
sequence =
|
sequence =
|
||||||
type === 'start'
|
type === 'start'
|
||||||
? commonItem.items.slice(0, longestMatch)
|
? commonItem.items.slice(0, longestMatch)
|
||||||
: commonItem.items.slice(longestMatch);
|
: commonItem.items.slice(
|
||||||
|
commonItem.items.length - longestMatch
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Merge other chunks
|
// Merge other chunks
|
||||||
|
@ -156,7 +156,9 @@ export function parseEmojiTree(items: TreeItem[]): EmojiItemRegex {
|
|||||||
if (parsedItems.length === 1) {
|
if (parsedItems.length === 1) {
|
||||||
return parsedItems[0];
|
return parsedItems[0];
|
||||||
}
|
}
|
||||||
return mergeSimilarItemsInSet(createSetEmojiRegexItem(parsedItems));
|
const set = createSetEmojiRegexItem(parsedItems);
|
||||||
|
const result = mergeSimilarItemsInSet(set);
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
function parseItemChildren(item: TreeItem): ParsedTreeItem {
|
function parseItemChildren(item: TreeItem): ParsedTreeItem {
|
||||||
|
@ -1,10 +1,58 @@
|
|||||||
import { createOptimisedRegex } from '../lib/emoji/regex/create';
|
import { readFile, writeFile, unlink } from 'node:fs/promises';
|
||||||
|
import { parseEmojiTestFile } from '../lib/emoji/test/parse';
|
||||||
|
import { emojiVersion } from '../lib/emoji/data';
|
||||||
|
import {
|
||||||
|
createOptimisedRegex,
|
||||||
|
createOptimisedRegexForEmojiSequences,
|
||||||
|
} from '../lib/emoji/regex/create';
|
||||||
import {
|
import {
|
||||||
getEmojiMatchesInText,
|
getEmojiMatchesInText,
|
||||||
sortEmojiMatchesInText,
|
sortEmojiMatchesInText,
|
||||||
} from '../lib/emoji/replace/find';
|
} from '../lib/emoji/replace/find';
|
||||||
|
import { getQualifiedEmojiVariations } from '../lib/emoji/test/variations';
|
||||||
|
import { getEmojiSequenceString } from '../lib/emoji/format';
|
||||||
|
|
||||||
describe('Finding emojis in text', () => {
|
describe('Finding emojis in text', () => {
|
||||||
|
/**
 * Load the Unicode `emoji-test.txt` file for the current `emojiVersion`.
 *
 * A copy cached in `tests/fixtures` is used when it can be read. Otherwise the
 * file is downloaded from unicode.org and written to the cache.
 *
 * @returns File content, or `undefined` if the download failed or the content
 * does not contain the expected `# Version: ...` line (the cached copy is
 * removed in the latter case).
 */
async function fetchEmojiTestData(): Promise<string | undefined> {
	// Fetch emojis, cache it
	const source = `tests/fixtures/download-emoji-${emojiVersion}.txt`;

	let data: string | undefined;
	try {
		data = await readFile(source, 'utf8');
	} catch {
		// Cache is missing or unreadable: download the file below
	}

	if (!data) {
		try {
			const response = await fetch(
				`https://unicode.org/Public/emoji/${emojiVersion}/emoji-test.txt`
			);
			// `text()` returns a Promise and must be awaited. Converting the
			// pending Promise to a string would cache "[object Promise]"
			data = await response.text();
		} catch {
			// Network is not available: caller treats `undefined` as "skip"
			return;
		}
		await writeFile(source, data, 'utf8');
	}

	// Test content, unlink cache on failure
	if (data.indexOf(`# Version: ${emojiVersion}`) === -1) {
		try {
			await unlink(source);
		} catch {
			// Nothing to remove
		}
		return;
	}
	return data;
}
|
||||||
|
|
||||||
|
// Content of the emoji test file, `undefined` if it is not available
let data: string | undefined;

// Load test data before tests run
beforeAll(async () => {
	const content = await fetchEmojiTestData();
	data = content;
});
|
||||||
|
|
||||||
it('Simple regex', () => {
|
it('Simple regex', () => {
|
||||||
const regexValue = createOptimisedRegex([
|
const regexValue = createOptimisedRegex([
|
||||||
'1F600',
|
'1F600',
|
||||||
@ -325,4 +373,87 @@ describe('Finding emojis in text', () => {
|
|||||||
},
|
},
|
||||||
]);
|
]);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('Finding all test emojis', () => {
	if (!data) {
		console.warn('Test skipped: test data is not available');
		return;
	}

	// Convert text to list of code points
	const toCodePoints = (text: string): number[] =>
		Array.from(text, (char) => char.codePointAt(0) as number);

	// Escaped UTF-16 version of sequence, used only in debug output
	const escaped = (codePoints: number[]): string =>
		`(\\u${getEmojiSequenceString(codePoints, {
			format: 'utf-16',
			separator: '\\u',
			case: 'upper',
		})})`;

	// Parse test data, keep only sequences
	const testData = parseEmojiTestFile(data);
	const sequences = Object.values(testData).map((item) => ({
		sequence: item.sequence,
	}));

	// Build one regex from all variations of all test emojis
	const iconsList = getQualifiedEmojiVariations(sequences, testData);
	const regex = new RegExp(
		createOptimisedRegexForEmojiSequences(
			iconsList.map(({ sequence }) => sequence)
		),
		'g'
	);

	// Each emoji on its own must be found as exactly one full match
	for (const { sequence: expected } of sequences) {
		const text = expected
			.map((codePoint) => String.fromCodePoint(codePoint))
			.join('');

		const result = getEmojiMatchesInText(regex, text);

		if (result.length !== 1) {
			// Dump expected sequence and every match before failing
			console.log(
				getEmojiSequenceString(expected),
				escaped(expected),
				text
			);
			for (const match of result) {
				const matched = toCodePoints(match.match);
				console.log(getEmojiSequenceString(matched), escaped(matched));
			}
			console.log(result);
			expect(result.length).toBe(1);
		}

		const resultSequence = toCodePoints(result[0].match);
		if (resultSequence.length !== expected.length) {
			// Partial match: dump details before assertion fails
			console.log(
				getEmojiSequenceString(expected),
				escaped(expected),
				result
			);
		}
		expect(resultSequence).toEqual(expected);
	}
});
|
||||||
});
|
});
|
||||||
|
@ -446,4 +446,44 @@ describe('Similar chunks of regex', () => {
|
|||||||
group: false,
|
group: false,
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('Same end match', () => {
	// Second sequence is identical to the last 2 numbers of the first one
	const regexItems = [
		createRegexForNumbersSequence([128139, 8205, 129489, 127996]),
		createRegexForNumbersSequence([129489, 127996]),
	];

	// Find common chunk
	const similar = findSimilarRegexItemSequences(regexItems);
	expect(similar).toEqual({
		score: 24,
		sequences: [
			{
				type: 'end',
				slices: [
					{ index: 0, slice: 3 },
					{ index: 1, slice: 'full' },
				],
			},
		],
	});

	const commonSequence = similar ? similar.sequences[0] : undefined;
	if (!commonSequence) {
		throw new Error('Unexpected undefined sequence');
	}

	// Merge items: different start becomes optional, common end is kept
	const result = mergeSimilarRegexItemSequences(regexItems, commonSequence);

	expect(result.length).toBe(1);
	expect(result[0].regex).toBe(
		'(?:\\uD83D\\uDC8B\\u200D)?\\uD83E\\uDDD1\\uD83C\\uDFFC'
	);
});
|
||||||
});
|
});
|
||||||
|
@ -1,6 +1,18 @@
|
|||||||
import { createOptimisedRegex } from '../lib/emoji/regex/create';
|
import {
|
||||||
|
getEmojiSequenceFromString,
|
||||||
|
getSequenceFromEmojiStringOrKeyword,
|
||||||
|
} from '../lib/emoji/cleanup';
|
||||||
|
import {
|
||||||
|
createOptimisedRegex,
|
||||||
|
createOptimisedRegexForEmojiSequences,
|
||||||
|
} from '../lib/emoji/regex/create';
|
||||||
|
|
||||||
describe('Emoji regex matching', () => {
|
describe('Emoji regex matching', () => {
|
||||||
|
/**
 * Convert emoji string or keyword to text that contains that emoji
 */
function code(value: string): string {
	let text = '';
	for (const codePoint of getSequenceFromEmojiStringOrKeyword(value)) {
		text += String.fromCodePoint(codePoint);
	}
	return text;
}
|
||||||
|
|
||||||
it('Simple regex', () => {
|
it('Simple regex', () => {
|
||||||
const regexValue = createOptimisedRegex(['1F600', '1F603', '1F604']);
|
const regexValue = createOptimisedRegex(['1F600', '1F603', '1F604']);
|
||||||
|
|
||||||
@ -50,29 +62,17 @@ Tabby cat: :tabby_cat:
|
|||||||
|
|
||||||
const matches = `
|
const matches = `
|
||||||
E0.6 dashing away: ${String.fromCodePoint(0x1f4a8)}
|
E0.6 dashing away: ${String.fromCodePoint(0x1f4a8)}
|
||||||
E13.1 face exhaling: ${
|
E13.1 face exhaling: ${code('1f62e-200d-1f4a8')}
|
||||||
String.fromCodePoint(0x1f62e) +
|
|
||||||
String.fromCodePoint(0x200d) +
|
|
||||||
String.fromCodePoint(0x1f4a8)
|
|
||||||
}
|
|
||||||
E1.0 face with open mouth: ${String.fromCodePoint(0x1f62e)}
|
E1.0 face with open mouth: ${String.fromCodePoint(0x1f62e)}
|
||||||
E0.6 smiling face: ${
|
E0.6 smiling face: ${code('263a-fe0f')} (icon)
|
||||||
String.fromCodePoint(0x263a) + String.fromCodePoint(0xfe0f)
|
|
||||||
} (icon)
|
|
||||||
E0.6 smiling face: ${String.fromCodePoint(0x263a)} (text)
|
E0.6 smiling face: ${String.fromCodePoint(0x263a)} (text)
|
||||||
`.match(new RegExp(regexValue, 'g'));
|
`.match(new RegExp(regexValue, 'g'));
|
||||||
|
|
||||||
expect(matches?.length).toBe(5);
|
expect(matches?.length).toBe(5);
|
||||||
expect(matches?.[0]).toBe(String.fromCodePoint(0x1f4a8));
|
expect(matches?.[0]).toBe(code('1f4a8'));
|
||||||
expect(matches?.[1]).toBe(
|
expect(matches?.[1]).toBe(code('1f62e 200d 1f4a8'));
|
||||||
String.fromCodePoint(0x1f62e) +
|
|
||||||
String.fromCodePoint(0x200d) +
|
|
||||||
String.fromCodePoint(0x1f4a8)
|
|
||||||
);
|
|
||||||
expect(matches?.[2]).toBe(String.fromCodePoint(0x1f62e));
|
expect(matches?.[2]).toBe(String.fromCodePoint(0x1f62e));
|
||||||
expect(matches?.[3]).toBe(
|
expect(matches?.[3]).toBe(code('263a fe0f'));
|
||||||
String.fromCodePoint(0x263a) + String.fromCodePoint(0xfe0f)
|
|
||||||
);
|
|
||||||
expect(matches?.[4]).toBe(String.fromCodePoint(0x263a));
|
expect(matches?.[4]).toBe(String.fromCodePoint(0x263a));
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -116,4 +116,24 @@ E1.0 waving hand: medium skin tone: ${
|
|||||||
String.fromCodePoint(0x1f44b) + String.fromCodePoint(0x1f3fd)
|
String.fromCodePoint(0x1f44b) + String.fromCodePoint(0x1f3fd)
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('Bugged mix of sequences', () => {
	// Long sequence is the short one with an extra '1f48b-200d' chunk
	const longSequence =
		'1f9d1-1f3fb-200d-2764-fe0f-200d-1f48b-200d-1f9d1-1f3fc';
	const shortSequence = '1f9d1-1f3fb-200d-2764-fe0f-200d-1f9d1-1f3fc';

	const sequences = [longSequence, shortSequence].map((item) =>
		getEmojiSequenceFromString(item)
	);
	const regexValue = createOptimisedRegexForEmojiSequences(sequences);

	// Long sequence must be found as one match
	const text = code(longSequence);
	const matches = text.match(new RegExp(regexValue, 'g'));

	expect(matches?.length).toBe(1);
	expect(matches?.[0]).toBe(text);
});
|
||||||
});
|
});
|
||||||
|
@ -221,4 +221,57 @@ describe('Emoji regex tree', () => {
|
|||||||
// '\\uD83E\\uDEF1\\uD83C(?:\\uDFFB|\\uDFFC)(?:\\u200D\\uD83E\\uDEF2\\uD83C[\\uDFFC-\\uDFFF])?'
|
// '\\uD83E\\uDEF1\\uD83C(?:\\uDFFB|\\uDFFC)(?:\\u200D\\uD83E\\uDEF2\\uD83C[\\uDFFC-\\uDFFF])?'
|
||||||
// );
|
// );
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('Multiple children with same last child', () => {
	const tree = createEmojisTree(
		[
			'1f9d1-1f3fb-200d-2764-fe0f-200d-1f48b-200d-1f9d1-1f3fc',
			'1f9d1-1f3fb-200d-2764-fe0f-200d-1f9d1-1f3fc',
		].map((item) => getEmojiSequenceFromString(item))
	);

	// Last item of both sequences, new object for each branch
	const lastChild = () => ({
		regex: createRegexForNumbersSequence([0x1f9d1, 0x1f3fc]),
		end: true,
	});

	// Branch that exists only in the longer sequence
	const kissBranch = {
		regex: createRegexForNumbersSequence([0x1f48b]),
		children: [lastChild()],
	};

	const heartBranch = {
		regex: createRegexForNumbersSequence([0x2764, 0xfe0f]),
		children: [kissBranch, lastChild()],
	};

	expect(tree).toEqual([
		{
			regex: createRegexForNumbersSequence([0x1f9d1, 0x1f3fb]),
			children: [heartBranch],
		},
	]);

	// 'D83E-DDD1-D83C-DFFB-200D-2764-FE0F-200D-D83D-DC8B-200D-D83E-DDD1-D83C-DFFC' +
	// 'D83E-DDD1-D83C-DFFB-200D-2764-FE0F-200D-D83E-DDD1-D83C-DFFC' =
	// 'D83E-DDD1-D83C-DFFB-200D-2764-FE0F-200D' + 'D83D-DC8B-200D'? + 'D83E-DDD1-D83C-DFFC'
	const expectedRegex = [
		// First common chunk
		'\\uD83E\\uDDD1\\uD83C\\uDFFB\\u200D\\u2764\\uFE0F?\\u200D',
		// Optional chunk
		'(?:\\uD83D\\uDC8B\\u200D)?',
		// Last common chunk
		'\\uD83E\\uDDD1\\uD83C\\uDFFC',
	].join('');

	expect(parseEmojiTree(tree).regex).toEqual(expectedRegex);
});
|
||||||
});
|
});
|
||||||
|
Loading…
x
Reference in New Issue
Block a user