From 4562f1dfc95ea2643ba53186d1364e71591ae719 Mon Sep 17 00:00:00 2001 From: Yuji Uchiyama Date: Sat, 24 Feb 2018 19:17:35 +0900 Subject: [PATCH] Add a SvmDataset class for SVM-Light (or LibSVM) format files (#237) * Add data loader for svm format * Add tests for error cases * Set proper exception messages * Add documents * Add error checking code for invalid column format * Add missing documents --- README.md | 1 + docs/index.md | 7 +- docs/machine-learning/datasets/svm-dataset.md | 13 ++ mkdocs.yml | 1 + src/Dataset/SvmDataset.php | 130 +++++++++++ src/Exception/DatasetException.php | 15 ++ tests/Dataset/Resources/svm/1x1.svm | 1 + tests/Dataset/Resources/svm/3x1.svm | 3 + tests/Dataset/Resources/svm/3x4.svm | 3 + tests/Dataset/Resources/svm/comments.svm | 2 + tests/Dataset/Resources/svm/empty.svm | 0 .../Dataset/Resources/svm/err_empty_line.svm | 3 + .../Dataset/Resources/svm/err_index_zero.svm | 1 + .../Resources/svm/err_invalid_feature.svm | 1 + .../Resources/svm/err_invalid_spaces.svm | 1 + .../Resources/svm/err_invalid_value.svm | 1 + tests/Dataset/Resources/svm/err_no_labels.svm | 1 + .../Resources/svm/err_string_index.svm | 1 + .../Resources/svm/err_string_labels.svm | 1 + tests/Dataset/Resources/svm/sparse.svm | 2 + tests/Dataset/Resources/svm/tabs.svm | 1 + tests/Dataset/SvmDatasetTest.php | 212 ++++++++++++++++++ 22 files changed, 398 insertions(+), 3 deletions(-) create mode 100644 docs/machine-learning/datasets/svm-dataset.md create mode 100644 src/Dataset/SvmDataset.php create mode 100644 tests/Dataset/Resources/svm/1x1.svm create mode 100644 tests/Dataset/Resources/svm/3x1.svm create mode 100644 tests/Dataset/Resources/svm/3x4.svm create mode 100644 tests/Dataset/Resources/svm/comments.svm create mode 100644 tests/Dataset/Resources/svm/empty.svm create mode 100644 tests/Dataset/Resources/svm/err_empty_line.svm create mode 100644 tests/Dataset/Resources/svm/err_index_zero.svm create mode 100644 tests/Dataset/Resources/svm/err_invalid_feature.svm 
create mode 100644 tests/Dataset/Resources/svm/err_invalid_spaces.svm create mode 100644 tests/Dataset/Resources/svm/err_invalid_value.svm create mode 100644 tests/Dataset/Resources/svm/err_no_labels.svm create mode 100644 tests/Dataset/Resources/svm/err_string_index.svm create mode 100644 tests/Dataset/Resources/svm/err_string_labels.svm create mode 100644 tests/Dataset/Resources/svm/sparse.svm create mode 100644 tests/Dataset/Resources/svm/tabs.svm create mode 100644 tests/Dataset/SvmDatasetTest.php diff --git a/README.md b/README.md index c5d788e..ddca60a 100644 --- a/README.md +++ b/README.md @@ -104,6 +104,7 @@ Example scripts are available in a separate repository [php-ai/php-ml-examples]( * [Array](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/array-dataset/) * [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset/) * [Files](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/files-dataset/) + * [SVM](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/svm-dataset/) * Ready to use: * [Iris](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/iris/) * [Wine](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/wine/) diff --git a/docs/index.md b/docs/index.md index 14cfba5..2e204e8 100644 --- a/docs/index.md +++ b/docs/index.md @@ -12,7 +12,7 @@ - + ![PHP-ML - Machine Learning library for PHP](assets/php-ml-logo.png) Fresh approach to Machine Learning in PHP. Algorithms, Cross Validation, Neural Network, Preprocessing, Feature Extraction and much more in one library. 
@@ -31,7 +31,7 @@ $labels = ['a', 'a', 'a', 'b', 'b', 'b']; $classifier = new KNearestNeighbors(); $classifier->train($samples, $labels); -$classifier->predict([3, 2]); +$classifier->predict([3, 2]); // return 'b' ``` @@ -89,6 +89,7 @@ Example scripts are available in a separate repository [php-ai/php-ml-examples]( * [Array](machine-learning/datasets/array-dataset.md) * [CSV](machine-learning/datasets/csv-dataset.md) * [Files](machine-learning/datasets/files-dataset.md) + * [SVM](machine-learning/datasets/svm-dataset.md) * Ready to use: * [Iris](machine-learning/datasets/demo/iris.md) * [Wine](machine-learning/datasets/demo/wine.md) @@ -100,7 +101,7 @@ Example scripts are available in a separate repository [php-ai/php-ml-examples]( * [Matrix](math/matrix.md) * [Set](math/set.md) * [Statistic](math/statistic.md) - + ## Contribute diff --git a/docs/machine-learning/datasets/svm-dataset.md b/docs/machine-learning/datasets/svm-dataset.md new file mode 100644 index 0000000..8ac1c26 --- /dev/null +++ b/docs/machine-learning/datasets/svm-dataset.md @@ -0,0 +1,13 @@ +# SvmDataset + +Helper class that loads data from an SVM-Light (or LibSVM) format file. It extends the `ArrayDataset`. + +### Constructor Parameters + +* $filepath - (string) path to the file + +``` +$dataset = new SvmDataset('dataset.svm'); +``` + +See [ArrayDataset](array-dataset.md) for more information. 
diff --git a/mkdocs.yml b/mkdocs.yml index 92f3837..490e5dc 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -38,6 +38,7 @@ pages: - Array Dataset: machine-learning/datasets/array-dataset.md - CSV Dataset: machine-learning/datasets/csv-dataset.md - Files Dataset: machine-learning/datasets/files-dataset.md + - SVM Dataset: machine-learning/datasets/svm-dataset.md - Ready to use datasets: - Iris: machine-learning/datasets/demo/iris.md - Wine: machine-learning/datasets/demo/wine.md diff --git a/src/Dataset/SvmDataset.php b/src/Dataset/SvmDataset.php new file mode 100644 index 0000000..8bd172b --- /dev/null +++ b/src/Dataset/SvmDataset.php @@ -0,0 +1,130 @@ + $maxIndex) { + $maxIndex = $index; + $sample = array_pad($sample, $maxIndex + 1, 0); + } + + $sample[$index] = $value; + } + + return [$sample, $target, $maxIndex]; + } + + private static function parseLine(string $line): array + { + $line = explode('#', $line, 2)[0]; + $line = rtrim($line); + $line = str_replace("\t", ' ', $line); + + $columns = explode(' ', $line); + + return $columns; + } + + private static function parseTargetColumn(string $column): float + { + if (!is_numeric($column)) { + throw DatasetException::invalidTarget($column); + } + + return (float) $column; + } + + private static function parseFeatureColumn(string $column): array + { + $feature = explode(':', $column, 2); + if (count($feature) != 2) { + throw DatasetException::invalidValue($column); + } + + $index = self::parseFeatureIndex($feature[0]); + $value = self::parseFeatureValue($feature[1]); + + return [$index, $value]; + } + + private static function parseFeatureIndex(string $index): int + { + if (!is_numeric($index) || !ctype_digit($index)) { + throw DatasetException::invalidIndex($index); + } + + if ((int) $index < 1) { + throw DatasetException::invalidIndex($index); + } + + return (int) $index - 1; + } + + private static function parseFeatureValue(string $value): float + { + if (!is_numeric($value)) { + throw 
DatasetException::invalidValue($value); + } + + return (float) $value; + } +} diff --git a/src/Exception/DatasetException.php b/src/Exception/DatasetException.php index 8d6d5da..1cb0bfc 100644 --- a/src/Exception/DatasetException.php +++ b/src/Exception/DatasetException.php @@ -12,4 +12,19 @@ class DatasetException extends Exception { return new self(sprintf('Dataset root folder "%s" missing.', $path)); } + + public static function invalidTarget(string $target): self + { + return new self(sprintf('Invalid target "%s".', $target)); + } + + public static function invalidIndex(string $index): self + { + return new self(sprintf('Invalid index "%s".', $index)); + } + + public static function invalidValue(string $value): self + { + return new self(sprintf('Invalid value "%s".', $value)); + } } diff --git a/tests/Dataset/Resources/svm/1x1.svm b/tests/Dataset/Resources/svm/1x1.svm new file mode 100644 index 0000000..fdd6c1f --- /dev/null +++ b/tests/Dataset/Resources/svm/1x1.svm @@ -0,0 +1 @@ +0 1:2.3 diff --git a/tests/Dataset/Resources/svm/3x1.svm b/tests/Dataset/Resources/svm/3x1.svm new file mode 100644 index 0000000..d817c96 --- /dev/null +++ b/tests/Dataset/Resources/svm/3x1.svm @@ -0,0 +1,3 @@ +1 1:2.3 +0 1:4.56 +1 1:78.9 diff --git a/tests/Dataset/Resources/svm/3x4.svm b/tests/Dataset/Resources/svm/3x4.svm new file mode 100644 index 0000000..5f6d015 --- /dev/null +++ b/tests/Dataset/Resources/svm/3x4.svm @@ -0,0 +1,3 @@ +1 1:2 2:4 3:6 4:8 +2 1:3 2:5 3:7 4:9 +0 1:1.2 2:3.4 3:5.6 4:7.8 diff --git a/tests/Dataset/Resources/svm/comments.svm b/tests/Dataset/Resources/svm/comments.svm new file mode 100644 index 0000000..7cf6fc4 --- /dev/null +++ b/tests/Dataset/Resources/svm/comments.svm @@ -0,0 +1,2 @@ +0 1:2 # This is a comment. +1 1:34 # This # is # : # also # a # comment # . 
diff --git a/tests/Dataset/Resources/svm/empty.svm b/tests/Dataset/Resources/svm/empty.svm new file mode 100644 index 0000000..e69de29 diff --git a/tests/Dataset/Resources/svm/err_empty_line.svm b/tests/Dataset/Resources/svm/err_empty_line.svm new file mode 100644 index 0000000..289e2b5 --- /dev/null +++ b/tests/Dataset/Resources/svm/err_empty_line.svm @@ -0,0 +1,3 @@ +1 1:2.3 + +0 1:4.56 diff --git a/tests/Dataset/Resources/svm/err_index_zero.svm b/tests/Dataset/Resources/svm/err_index_zero.svm new file mode 100644 index 0000000..56c20f8 --- /dev/null +++ b/tests/Dataset/Resources/svm/err_index_zero.svm @@ -0,0 +1 @@ +0 0:2.3 diff --git a/tests/Dataset/Resources/svm/err_invalid_feature.svm b/tests/Dataset/Resources/svm/err_invalid_feature.svm new file mode 100644 index 0000000..f57b6c5 --- /dev/null +++ b/tests/Dataset/Resources/svm/err_invalid_feature.svm @@ -0,0 +1 @@ +0 12345 diff --git a/tests/Dataset/Resources/svm/err_invalid_spaces.svm b/tests/Dataset/Resources/svm/err_invalid_spaces.svm new file mode 100644 index 0000000..77ff868 --- /dev/null +++ b/tests/Dataset/Resources/svm/err_invalid_spaces.svm @@ -0,0 +1 @@ + 0 1:2.3 diff --git a/tests/Dataset/Resources/svm/err_invalid_value.svm b/tests/Dataset/Resources/svm/err_invalid_value.svm new file mode 100644 index 0000000..b358890 --- /dev/null +++ b/tests/Dataset/Resources/svm/err_invalid_value.svm @@ -0,0 +1 @@ +0 1:xyz diff --git a/tests/Dataset/Resources/svm/err_no_labels.svm b/tests/Dataset/Resources/svm/err_no_labels.svm new file mode 100644 index 0000000..789be38 --- /dev/null +++ b/tests/Dataset/Resources/svm/err_no_labels.svm @@ -0,0 +1 @@ +1:2.3 diff --git a/tests/Dataset/Resources/svm/err_string_index.svm b/tests/Dataset/Resources/svm/err_string_index.svm new file mode 100644 index 0000000..25cb296 --- /dev/null +++ b/tests/Dataset/Resources/svm/err_string_index.svm @@ -0,0 +1 @@ +0 x:2.3 diff --git a/tests/Dataset/Resources/svm/err_string_labels.svm 
b/tests/Dataset/Resources/svm/err_string_labels.svm new file mode 100644 index 0000000..8cc16f7 --- /dev/null +++ b/tests/Dataset/Resources/svm/err_string_labels.svm @@ -0,0 +1 @@ +A 1:2.3 diff --git a/tests/Dataset/Resources/svm/sparse.svm b/tests/Dataset/Resources/svm/sparse.svm new file mode 100644 index 0000000..23d7485 --- /dev/null +++ b/tests/Dataset/Resources/svm/sparse.svm @@ -0,0 +1,2 @@ +0 2:3.45 +1 5:6.789 diff --git a/tests/Dataset/Resources/svm/tabs.svm b/tests/Dataset/Resources/svm/tabs.svm new file mode 100644 index 0000000..bf8757f --- /dev/null +++ b/tests/Dataset/Resources/svm/tabs.svm @@ -0,0 +1 @@ +1 1:23 2:45 # comments diff --git a/tests/Dataset/SvmDatasetTest.php b/tests/Dataset/SvmDatasetTest.php new file mode 100644 index 0000000..884da3a --- /dev/null +++ b/tests/Dataset/SvmDatasetTest.php @@ -0,0 +1,212 @@ +assertEquals($expectedSamples, $dataset->getSamples()); + $this->assertEquals($expectedTargets, $dataset->getTargets()); + } + + public function testSvmDataset1x1(): void + { + $filePath = self::getFilePath('1x1.svm'); + $dataset = new SvmDataset($filePath); + + $expectedSamples = [ + [2.3], + ]; + $expectedTargets = [ + 0, + ]; + + $this->assertEquals($expectedSamples, $dataset->getSamples()); + $this->assertEquals($expectedTargets, $dataset->getTargets()); + } + + public function testSvmDataset3x1(): void + { + $filePath = self::getFilePath('3x1.svm'); + $dataset = new SvmDataset($filePath); + + $expectedSamples = [ + [2.3], + [4.56], + [78.9], + ]; + $expectedTargets = [ + 1, + 0, + 1, + ]; + + $this->assertEquals($expectedSamples, $dataset->getSamples()); + $this->assertEquals($expectedTargets, $dataset->getTargets()); + } + + public function testSvmDataset3x4(): void + { + $filePath = self::getFilePath('3x4.svm'); + $dataset = new SvmDataset($filePath); + + $expectedSamples = [ + [2, 4, 6, 8], + [3, 5, 7, 9], + [1.2, 3.4, 5.6, 7.8], + ]; + $expectedTargets = [ + 1, + 2, + 0, + ]; + + $this->assertEquals($expectedSamples, 
$dataset->getSamples()); + $this->assertEquals($expectedTargets, $dataset->getTargets()); + } + + public function testSvmDatasetSparse(): void + { + $filePath = self::getFilePath('sparse.svm'); + $dataset = new SvmDataset($filePath); + + $expectedSamples = [ + [0, 3.45, 0, 0, 0], + [0, 0, 0, 0, 6.789], + ]; + $expectedTargets = [ + 0, + 1, + ]; + + $this->assertEquals($expectedSamples, $dataset->getSamples()); + $this->assertEquals($expectedTargets, $dataset->getTargets()); + } + + public function testSvmDatasetComments(): void + { + $filePath = self::getFilePath('comments.svm'); + $dataset = new SvmDataset($filePath); + + $expectedSamples = [ + [2], + [34], + ]; + $expectedTargets = [ + 0, + 1, + ]; + + $this->assertEquals($expectedSamples, $dataset->getSamples()); + $this->assertEquals($expectedTargets, $dataset->getTargets()); + } + + public function testSvmDatasetTabs(): void + { + $filePath = self::getFilePath('tabs.svm'); + $dataset = new SvmDataset($filePath); + + $expectedSamples = [ + [23, 45], + ]; + $expectedTargets = [ + 1, + ]; + + $this->assertEquals($expectedSamples, $dataset->getSamples()); + $this->assertEquals($expectedTargets, $dataset->getTargets()); + } + + public function testSvmDatasetMissingFile(): void + { + $this->expectException(FileException::class); + + $filePath = self::getFilePath('err_file_not_exists.svm'); + $dataset = new SvmDataset($filePath); + } + + public function testSvmDatasetEmptyLine(): void + { + $this->expectException(DatasetException::class); + + $filePath = self::getFilePath('err_empty_line.svm'); + $dataset = new SvmDataset($filePath); + } + + public function testSvmDatasetNoLabels(): void + { + $this->expectException(DatasetException::class); + + $filePath = self::getFilePath('err_no_labels.svm'); + $dataset = new SvmDataset($filePath); + } + + public function testSvmDatasetStringLabels(): void + { + $this->expectException(DatasetException::class); + + $filePath = self::getFilePath('err_string_labels.svm'); + $dataset 
= new SvmDataset($filePath); + } + + public function testSvmDatasetInvalidSpaces(): void + { + $this->expectException(DatasetException::class); + + $filePath = self::getFilePath('err_invalid_spaces.svm'); + $dataset = new SvmDataset($filePath); + } + + public function testSvmDatasetStringIndex(): void + { + $this->expectException(DatasetException::class); + + $filePath = self::getFilePath('err_string_index.svm'); + $dataset = new SvmDataset($filePath); + } + + public function testSvmDatasetIndexZero(): void + { + $this->expectException(DatasetException::class); + + $filePath = self::getFilePath('err_index_zero.svm'); + $dataset = new SvmDataset($filePath); + } + + public function testSvmDatasetInvalidValue(): void + { + $this->expectException(DatasetException::class); + + $filePath = self::getFilePath('err_invalid_value.svm'); + $dataset = new SvmDataset($filePath); + } + + public function testSvmDatasetInvalidFeature(): void + { + $this->expectException(DatasetException::class); + + $filePath = self::getFilePath('err_invalid_feature.svm'); + $dataset = new SvmDataset($filePath); + } + + private static function getFilePath(string $baseName): string + { + return dirname(__FILE__).'/Resources/svm/'.$baseName; + } +}