From da4ddc54387e31404428af626802cbbcb629364e Mon Sep 17 00:00:00 2001
From: Vjacheslav Trushkin <cyberalien@gmail.com>
Date: Sat, 3 Dec 2022 20:14:15 +0200
Subject: [PATCH] feat: function to parse emoji test file

---
 packages/utils/package.json                   |  5 +
 packages/utils/src/emoji/parse-test.ts        | 61 ++++++++++++
 .../tests/validate-emoji-unicode-test.ts      | 99 +++++--------------
 3 files changed, 91 insertions(+), 74 deletions(-)
 create mode 100644 packages/utils/src/emoji/parse-test.ts
diff --git a/packages/utils/package.json b/packages/utils/package.json
index bf0dca0..191759c 100644
--- a/packages/utils/package.json
+++ b/packages/utils/package.json
@@ -122,6 +122,11 @@
 			"import": "./lib/emoji/format.mjs",
 			"types": "./lib/emoji/format.d.ts"
 		},
+		"./lib/emoji/parse-test": {
+			"require": "./lib/emoji/parse-test.cjs",
+			"import": "./lib/emoji/parse-test.mjs",
+			"types": "./lib/emoji/parse-test.d.ts"
+		},
 		"./lib/icon-set/convert-info": {
 			"require": "./lib/icon-set/convert-info.cjs",
 			"import": "./lib/icon-set/convert-info.mjs",
diff --git a/packages/utils/src/emoji/parse-test.ts b/packages/utils/src/emoji/parse-test.ts
new file mode 100644
index 0000000..b651ff8
--- /dev/null
+++ b/packages/utils/src/emoji/parse-test.ts
@@ -0,0 +1,61 @@
+import { getEmojiSequenceFromString } from './cleanup';
+
+// Emoji types: possible values of the second (';' separated) field of a test file line
+type EmojiType =
+	| 'component'
+	| 'fully-qualified'
+	| 'minimally-qualified'
+	| 'unqualified';
+const componentType: EmojiType = 'component';
+
+// Allowed types, in order of conversion. Parser throws an error for any other type
+const allowedTypes: Set<EmojiType> = new Set([
+	componentType,
+	'fully-qualified',
+	'minimally-qualified',
+	'unqualified',
+]);
+
+/**
+ * Get all unique emoji sequences from test file, skipping comments and malformed lines
+ *
+ * Returns array of sequences, each sequence is array of numbers. Throws on unknown emoji type
+ */
+export function parseEmojiTestFile(data: string): number[][] {
+	const emojis: Set<string> = new Set();
+
+	// Parse all lines, expected format: "code points ; type # comment"
+	data.split('\n').forEach((line) => {
+		line = line.trim();
+		const parts = line.split('#');
+		if (parts.length < 2) {
+			return;
+		}
+
+		// Get code and type from first chunk (text before first '#')
+		const firstChunk = (parts.shift() as string).trim();
+		if (!firstChunk) {
+			// Empty first chunk: a comment
+			return;
+		}
+		const firstChunkParts = firstChunk.split(';');
+		if (firstChunkParts.length !== 2) {
+			return;
+		}
+		const text = firstChunkParts[0].trim();
+		const code = text.toLowerCase().replace(/\s+/g, '-');
+		if (!code || !code.match(/^[a-f0-9]+[a-f0-9-]*[a-f0-9]+$/)) {
+			return;
+		}
+		const type = firstChunkParts[1].trim() as EmojiType;
+		if (!allowedTypes.has(type)) {
+			throw new Error(`Bad emoji type: ${type}`);
+		}
+
+		// Add code: lower case hexadecimal numbers separated by dash, Set removes duplicates
+		emojis.add(code);
+	});
+
+	// Convert unique codes to sequences
+	return Array.from(emojis).map(getEmojiSequenceFromString);
+}
diff --git a/packages/utils/tests/validate-emoji-unicode-test.ts b/packages/utils/tests/validate-emoji-unicode-test.ts
index 9b052a1..a5bff24 100644
--- a/packages/utils/tests/validate-emoji-unicode-test.ts
+++ b/packages/utils/tests/validate-emoji-unicode-test.ts
@@ -5,23 +5,9 @@ import {
 	startUTF32Pair1,
 	startUTF32Pair2,
 	endUTF32Pair,
+	minUTF32,
 } from '../lib/emoji/data';
-
-// Emoji types
-type EmojiType =
-	| 'component'
-	| 'fully-qualified'
-	| 'minimally-qualified'
-	| 'unqualified';
-const componentType: EmojiType = 'component';
-
-// Allowed types, in order of conversion
-const allowedTypes: Set<EmojiType> = new Set([
-	componentType,
-	'fully-qualified',
-	'minimally-qualified',
-	'unqualified',
-]);
+import { parseEmojiTestFile } from '../lib/emoji/parse-test';
 
 describe('Testing emoji code points', () => {
 	it('Checking available ranges', async () => {
@@ -58,49 +44,15 @@ describe('Testing emoji code points', () => {
 		}
 
 		// Get all emojis
-		const utf16: Set<string> = new Set();
-		const utf32: Set<string> = new Set();
-		data.split('\n').forEach((line) => {
-			line = line.trim();
-			const parts = line.split('#');
-			if (parts.length < 2) {
-				return;
-			}
+		const utf16: Set<number> = new Set();
+		const utf32: Set<number> = new Set();
 
-			// Get code and type from first chunk
-			const firstChunk = (parts.shift() as string).trim();
-			if (!firstChunk) {
-				// Empty first chunk: a comment
-				return;
-			}
-			const firstChunkParts = firstChunk.split(';');
-			if (firstChunkParts.length !== 2) {
-				return;
-			}
-			const text = firstChunkParts[0].trim();
-			const code = text.toLowerCase().replace(/\s+/g, '-');
-			if (!code) {
-				return;
-			}
-			const type = firstChunkParts[1].trim() as EmojiType;
-			if (!allowedTypes.has(type)) {
-				throw new Error(`Bad emoji type: ${type}`);
-			}
-
-			// Add code
-			code.split('-').forEach((chunk) => {
-				switch (chunk.length) {
-					case 2:
-					case 4:
-						utf16.add(chunk);
-						break;
-
-					case 5:
-						utf32.add(chunk);
-						break;
-
-					default:
-						throw new Error(`Bad emoji code: ${text}`);
+		parseEmojiTestFile(data).forEach((sequence) => {
+			sequence.forEach((code) => {
+				if (code < minUTF32) {
+					utf16.add(code);
+				} else {
+					utf32.add(code);
 				}
 			});
 		});
@@ -129,10 +81,9 @@ describe('Testing emoji code points', () => {
 
 		// ... for UTF-16 code points
 		let utf16Range: Range | undefined;
-		utf16.forEach((str) => {
-			const code = getEmojiCodePoint(str);
+		utf16.forEach((code) => {
 			if (code > startUTF32Pair1 && code < endUTF32Pair) {
-				throw new Error(`UTF16 in UTF32 range: ${str}`);
+				throw new Error(`UTF16 in UTF32 range: ${code}`);
 			}
 			utf16Range = add(code, utf16Range);
 		});
@@ -140,27 +91,18 @@ describe('Testing emoji code points', () => {
 		// ... for UTF-32 code points
 		let utf32FirstRange: Range | undefined;
 		let utf32SecondRange: Range | undefined;
-		utf32.forEach((str) => {
-			const pair = splitUTF32Number(getEmojiCodePoint(str));
+		utf32.forEach((code) => {
+			const pair = splitUTF32Number(code);
 			if (pair) {
 				utf32FirstRange = add(pair[0], utf32FirstRange);
 				utf32SecondRange = add(pair[1], utf32SecondRange);
 			} else {
-				throw new Error(`Unexpected item in UTF32 set: ${str}`);
+				throw new Error(`Unexpected item in UTF32 set: ${code}`);
 			}
 		});
 
-		// Check UTF-32 emoji ranges
-		expect(utf32FirstRange).toBeDefined();
-		expect(utf32FirstRange!.min).toBeGreaterThanOrEqual(startUTF32Pair1);
-		expect(utf32FirstRange!.max).toBeLessThan(startUTF32Pair2);
-
-		expect(utf32SecondRange).toBeDefined();
-		expect(utf32SecondRange!.min).toBeGreaterThanOrEqual(startUTF32Pair2);
-		expect(utf32SecondRange!.max).toBeLessThan(endUTF32Pair);
-
-		// Dump ranges
 		/*
+		// Dump ranges
 		function dump(item: Range | undefined): string {
 			if (!item) {
 				return 'undefined';
@@ -172,5 +114,14 @@ describe('Testing emoji code points', () => {
 		console.log('UTF16:', dump(utf16Range));
 		console.log('UTF32:', dump(utf32FirstRange), dump(utf32SecondRange));
 		*/
+
+		// Check UTF-32 emoji ranges
+		expect(utf32FirstRange).toBeDefined();
+		expect(utf32FirstRange!.min).toBeGreaterThanOrEqual(startUTF32Pair1);
+		expect(utf32FirstRange!.max).toBeLessThan(startUTF32Pair2);
+
+		expect(utf32SecondRange).toBeDefined();
+		expect(utf32SecondRange!.min).toBeGreaterThanOrEqual(startUTF32Pair2);
+		expect(utf32SecondRange!.max).toBeLessThan(endUTF32Pair);
 	});
 });