2
0
mirror of https://github.com/iconify/iconify.git synced 2025-01-06 07:20:40 +00:00

fix(utils): bugged emoji regex

This commit is contained in:
Vjacheslav Trushkin 2023-01-01 17:40:07 +02:00
parent 92a2624d98
commit e443b2ae12
7 changed files with 271 additions and 24 deletions

View File

@ -3,7 +3,7 @@
"type": "module",
"description": "Common functions for working with Iconify icon sets used by various packages.",
"author": "Vjacheslav Trushkin",
"version": "2.0.6",
"version": "2.0.7",
"license": "MIT",
"bugs": "https://github.com/iconify/iconify/issues",
"homepage": "https://iconify.design/",

View File

@ -222,13 +222,12 @@ export function mergeSimilarRegexItemSequences(
length = 1;
}
} else {
length = slice;
if (item.type !== 'sequence') {
throw new Error(
`Unexpected partial match for type "${item.type}"`
);
}
length = type === 'start' ? slice : item.items.length - slice;
// Copy remaining chunks
differentSequences.push(
@ -267,7 +266,9 @@ export function mergeSimilarRegexItemSequences(
sequence =
type === 'start'
? commonItem.items.slice(0, longestMatch)
: commonItem.items.slice(longestMatch);
: commonItem.items.slice(
commonItem.items.length - longestMatch
);
}
// Merge other chunks

View File

@ -156,7 +156,9 @@ export function parseEmojiTree(items: TreeItem[]): EmojiItemRegex {
if (parsedItems.length === 1) {
return parsedItems[0];
}
return mergeSimilarItemsInSet(createSetEmojiRegexItem(parsedItems));
const set = createSetEmojiRegexItem(parsedItems);
const result = mergeSimilarItemsInSet(set);
return result;
}
function parseItemChildren(item: TreeItem): ParsedTreeItem {

View File

@ -1,10 +1,58 @@
import { createOptimisedRegex } from '../lib/emoji/regex/create';
import { readFile, writeFile, unlink } from 'node:fs/promises';
import { parseEmojiTestFile } from '../lib/emoji/test/parse';
import { emojiVersion } from '../lib/emoji/data';
import {
createOptimisedRegex,
createOptimisedRegexForEmojiSequences,
} from '../lib/emoji/regex/create';
import {
getEmojiMatchesInText,
sortEmojiMatchesInText,
} from '../lib/emoji/replace/find';
import { getQualifiedEmojiVariations } from '../lib/emoji/test/variations';
import { getEmojiSequenceString } from '../lib/emoji/format';
describe('Finding emojis in text', () => {
/**
 * Load the Unicode `emoji-test.txt` file for the current `emojiVersion`.
 *
 * A cached copy in `tests/fixtures/` is used when it can be read. Otherwise
 * the file is downloaded from unicode.org and written to the cache.
 *
 * @returns File contents, or `undefined` when the file cannot be downloaded
 * or does not contain the expected `# Version:` header (in which case the
 * cached copy is removed).
 */
async function fetchEmojiTestData(): Promise<string | undefined> {
	// Fetch emojis, cache it
	const source = `tests/fixtures/download-emoji-${emojiVersion}.txt`;

	let data: string | undefined;
	try {
		data = await readFile(source, 'utf8');
	} catch {
		// Cache is missing or unreadable: download the file below
	}

	if (!data) {
		try {
			const response = await fetch(
				`https://unicode.org/Public/emoji/${emojiVersion}/emoji-test.txt`
			);
			if (!response.ok) {
				// Do not cache error pages
				return;
			}
			// `text()` returns a promise. It must be awaited: calling
			// `toString()` on it yields "[object Promise]", not the body.
			data = await response.text();
		} catch {
			// Network is not available: callers treat `undefined` as
			// "test data is not available"
			return;
		}
		await writeFile(source, data, 'utf8');
	}

	// Test content, unlink cache on failure
	if (!data.includes(`# Version: ${emojiVersion}`)) {
		try {
			await unlink(source);
		} catch {
			// Cache file could not be removed; nothing else to clean up
		}
		return;
	}

	return data;
}
// Emoji test data shared by the tests below. Stays `undefined` when
// fetchEmojiTestData() does not return data.
let data: string | undefined;
// Load the test data once, before the tests run.
beforeAll(async () => {
data = await fetchEmojiTestData();
});
it('Simple regex', () => {
const regexValue = createOptimisedRegex([
'1F600',
@ -325,4 +373,87 @@ describe('Finding emojis in text', () => {
},
]);
});
it('Finding all test emojis', () => {
	// Nothing to check when emoji test data could not be loaded
	if (!data) {
		console.warn('Test skipped: test data is not available');
		return;
	}

	// Two printable forms of a sequence, used only in failure logs
	const describeSequence = (codePoints: number[]) => {
		const plain = getEmojiSequenceString(codePoints);
		const utf16 = getEmojiSequenceString(codePoints, {
			format: 'utf-16',
			separator: '\\u',
			case: 'upper',
		});
		return [plain, `(\\u${utf16})`];
	};

	// Split text into code points (string iteration is per code point)
	const toCodePoints = (text: string): number[] =>
		Array.from(text, (char) => char.codePointAt(0) as number);

	// Parse test data, keep only sequences
	const testData = parseEmojiTestFile(data);
	const sequences = Object.values(testData).map(({ sequence }) => ({
		sequence,
	}));

	// Build one regex from all qualified variations
	const iconsList = getQualifiedEmojiVariations(sequences, testData);
	const regex = new RegExp(
		createOptimisedRegexForEmojiSequences(
			iconsList.map((item) => item.sequence)
		),
		'g'
	);

	for (const { sequence: expected } of sequences) {
		const text = expected
			.map((code) => String.fromCodePoint(code))
			.join('');

		// Each emoji, on its own, must be found exactly once
		const result = getEmojiMatchesInText(regex, text);
		if (result.length !== 1) {
			// Log what was searched for and what was found, then fail
			console.log(...describeSequence(expected), text);
			for (const match of result) {
				console.log(...describeSequence(toCodePoints(match.match)));
			}
			console.log(result);
			expect(result.length).toBe(1);
		}

		// The single match must cover the whole emoji
		const actual = toCodePoints(result[0].match);
		if (actual.length !== expected.length) {
			console.log(...describeSequence(expected), result);
		}
		expect(actual).toEqual(expected);
	}
});
});

View File

@ -446,4 +446,44 @@ describe('Similar chunks of regex', () => {
group: false,
});
});
// Two sequences that share the same ending: the second sequence is the tail
// of the first one. Merging must make the leading part of the first sequence
// optional instead of dropping or duplicating the shared ending.
it('Same end match', () => {
// Code points in hex: 1F48B 200D 1F9D1 1F3FC and 1F9D1 1F3FC
const items = [
createRegexForNumbersSequence([128139, 8205, 129489, 127996]),
createRegexForNumbersSequence([129489, 127996]),
];
const merge = findSimilarRegexItemSequences(items);
// Expect one match of type 'end': item 1 matches in full, item 0 matches
// partially (presumably `slice` is the offset where the shared ending
// starts in item 0 - confirm against findSimilarRegexItemSequences)
expect(merge).toEqual({
score: 24,
sequences: [
{
type: 'end',
slices: [
{
index: 0,
slice: 3,
},
{
index: 1,
slice: 'full',
},
],
},
],
});
// Narrow the type: `merge` and its first sequence could be undefined
const sequence = merge?.sequences[0];
if (!sequence) {
throw new Error('Unexpected undefined sequence');
}
// Merge items
const merged = mergeSimilarRegexItemSequences(items, sequence);
expect(merged.length).toBe(1);
// Optional 1F48B 200D prefix, followed by the shared 1F9D1 1F3FC ending
expect(merged[0].regex).toBe(
'(?:\\uD83D\\uDC8B\\u200D)?\\uD83E\\uDDD1\\uD83C\\uDFFC'
);
});
});

View File

@ -1,6 +1,18 @@
import { createOptimisedRegex } from '../lib/emoji/regex/create';
import {
getEmojiSequenceFromString,
getSequenceFromEmojiStringOrKeyword,
} from '../lib/emoji/cleanup';
import {
createOptimisedRegex,
createOptimisedRegexForEmojiSequences,
} from '../lib/emoji/regex/create';
describe('Emoji regex matching', () => {
/**
 * Convert an emoji sequence, written as a string accepted by
 * getSequenceFromEmojiStringOrKeyword(), to the text it represents.
 */
function code(value: string): string {
	let text = '';
	for (const codePoint of getSequenceFromEmojiStringOrKeyword(value)) {
		text += String.fromCodePoint(codePoint);
	}
	return text;
}
it('Simple regex', () => {
const regexValue = createOptimisedRegex(['1F600', '1F603', '1F604']);
@ -50,29 +62,17 @@ Tabby cat: :tabby_cat:
const matches = `
E0.6 dashing away: ${String.fromCodePoint(0x1f4a8)}
E13.1 face exhaling: ${
String.fromCodePoint(0x1f62e) +
String.fromCodePoint(0x200d) +
String.fromCodePoint(0x1f4a8)
}
E13.1 face exhaling: ${code('1f62e-200d-1f4a8')}
E1.0 face with open mouth: ${String.fromCodePoint(0x1f62e)}
E0.6 smiling face: ${
String.fromCodePoint(0x263a) + String.fromCodePoint(0xfe0f)
} (icon)
E0.6 smiling face: ${code('263a-fe0f')} (icon)
E0.6 smiling face: ${String.fromCodePoint(0x263a)} (text)
`.match(new RegExp(regexValue, 'g'));
expect(matches?.length).toBe(5);
expect(matches?.[0]).toBe(String.fromCodePoint(0x1f4a8));
expect(matches?.[1]).toBe(
String.fromCodePoint(0x1f62e) +
String.fromCodePoint(0x200d) +
String.fromCodePoint(0x1f4a8)
);
expect(matches?.[0]).toBe(code('1f4a8'));
expect(matches?.[1]).toBe(code('1f62e 200d 1f4a8'));
expect(matches?.[2]).toBe(String.fromCodePoint(0x1f62e));
expect(matches?.[3]).toBe(
String.fromCodePoint(0x263a) + String.fromCodePoint(0xfe0f)
);
expect(matches?.[3]).toBe(code('263a fe0f'));
expect(matches?.[4]).toBe(String.fromCodePoint(0x263a));
});
@ -116,4 +116,24 @@ E1.0 waving hand: medium skin tone: ${
String.fromCodePoint(0x1f44b) + String.fromCodePoint(0x1f3fd)
);
});
it('Bugged mix of sequences', () => {
	// Two sequences with the same start and the same end; the longer one
	// has an extra `1f48b-200d` chunk in the middle
	const longSequence =
		'1f9d1-1f3fb-200d-2764-fe0f-200d-1f48b-200d-1f9d1-1f3fc';
	const shortSequence = '1f9d1-1f3fb-200d-2764-fe0f-200d-1f9d1-1f3fc';

	const sequences = [longSequence, shortSequence].map((item) =>
		getEmojiSequenceFromString(item)
	);
	const regex = new RegExp(
		createOptimisedRegexForEmojiSequences(sequences),
		'g'
	);

	// The longer sequence must be matched once, as a whole
	const matches = code(longSequence).match(regex);
	expect(matches?.length).toBe(1);
	expect(matches?.[0]).toBe(code(longSequence));
});
});

View File

@ -221,4 +221,57 @@ describe('Emoji regex tree', () => {
// '\\uD83E\\uDEF1\\uD83C(?:\\uDFFB|\\uDFFC)(?:\\u200D\\uD83E\\uDEF2\\uD83C[\\uDFFC-\\uDFFF])?'
// );
});
// Two sequences with the same start and the same end, where the longer one
// has an extra 1F48B chunk in the middle. Checks both the generated tree and
// the regex built from it.
it('Multiple children with same last child', () => {
const numbers = [
getEmojiSequenceFromString(
'1f9d1-1f3fb-200d-2764-fe0f-200d-1f48b-200d-1f9d1-1f3fc'
),
getEmojiSequenceFromString(
'1f9d1-1f3fb-200d-2764-fe0f-200d-1f9d1-1f3fc'
),
];
const tree = createEmojisTree(numbers);
// Expected tree: each node holds the chunk between 200D joiners (200D itself
// is not part of node regexes). The final `1f9d1-1f3fc` chunk appears twice:
// as a child of `1f48b` and as its sibling.
expect(tree).toEqual([
{
regex: createRegexForNumbersSequence([0x1f9d1, 0x1f3fb]),
children: [
{
regex: createRegexForNumbersSequence([0x2764, 0xfe0f]),
children: [
{
regex: createRegexForNumbersSequence([0x1f48b]),
children: [
{
regex: createRegexForNumbersSequence([
0x1f9d1, 0x1f3fc,
]),
end: true,
},
],
},
{
regex: createRegexForNumbersSequence([
0x1f9d1, 0x1f3fc,
]),
end: true,
},
],
},
],
},
]);
// Expected merge, written as UTF-16 code units:
//   'D83E-DDD1-D83C-DFFB-200D-2764-FE0F-200D-D83D-DC8B-200D-D83E-DDD1-D83C-DFFC'
//   'D83E-DDD1-D83C-DFFB-200D-2764-FE0F-200D-D83E-DDD1-D83C-DFFC'
// becomes:
//   'D83E-DDD1-D83C-DFFB-200D-2764-FE0F-200D' + optional 'D83D-DC8B-200D' + 'D83E-DDD1-D83C-DFFC'
// (FE0F is optional in the expected regex below)
expect(parseEmojiTree(tree).regex).toEqual(
// First common chunk
'\\uD83E\\uDDD1\\uD83C\\uDFFB\\u200D\\u2764\\uFE0F?\\u200D' +
// Optional chunk
'(?:\\uD83D\\uDC8B\\u200D)?' +
// Last common chunk
'\\uD83E\\uDDD1\\uD83C\\uDFFC'
);
});
});