Add a SvmDataset class for SVM-Light (or LibSVM) format files (#237)

* Add data loader for svm format

* Add tests for error cases

* Set proper exception messages

* Add documents

* Add error checking code for invalid column format

* Add missing documents
This commit is contained in:
Yuji Uchiyama 2018-02-24 19:17:35 +09:00 committed by Arkadiusz Kondas
parent a96f03e8dd
commit 4562f1dfc9
22 changed files with 398 additions and 3 deletions

View File

@ -104,6 +104,7 @@ Example scripts are available in a separate repository [php-ai/php-ml-examples](
* [Array](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/array-dataset/) * [Array](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/array-dataset/)
* [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset/) * [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset/)
* [Files](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/files-dataset/) * [Files](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/files-dataset/)
* [SVM](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/svm-dataset/)
* Ready to use: * Ready to use:
* [Iris](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/iris/) * [Iris](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/iris/)
* [Wine](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/wine/) * [Wine](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/wine/)

View File

@ -12,7 +12,7 @@
<a href="http://www.yegor256.com/2016/10/23/award-2017.html"> <a href="http://www.yegor256.com/2016/10/23/award-2017.html">
<img src="http://www.yegor256.com/images/award/2017/winner-itcraftsmanpl.png" <img src="http://www.yegor256.com/images/award/2017/winner-itcraftsmanpl.png"
style="width:203px;height:45px;"/></a> style="width:203px;height:45px;"/></a>
![PHP-ML - Machine Learning library for PHP](assets/php-ml-logo.png) ![PHP-ML - Machine Learning library for PHP](assets/php-ml-logo.png)
Fresh approach to Machine Learning in PHP. Algorithms, Cross Validation, Neural Network, Preprocessing, Feature Extraction and much more in one library. Fresh approach to Machine Learning in PHP. Algorithms, Cross Validation, Neural Network, Preprocessing, Feature Extraction and much more in one library.
@ -31,7 +31,7 @@ $labels = ['a', 'a', 'a', 'b', 'b', 'b'];
$classifier = new KNearestNeighbors(); $classifier = new KNearestNeighbors();
$classifier->train($samples, $labels); $classifier->train($samples, $labels);
$classifier->predict([3, 2]); $classifier->predict([3, 2]);
// return 'b' // return 'b'
``` ```
@ -89,6 +89,7 @@ Example scripts are available in a separate repository [php-ai/php-ml-examples](
* [Array](machine-learning/datasets/array-dataset.md) * [Array](machine-learning/datasets/array-dataset.md)
* [CSV](machine-learning/datasets/csv-dataset.md) * [CSV](machine-learning/datasets/csv-dataset.md)
* [Files](machine-learning/datasets/files-dataset.md) * [Files](machine-learning/datasets/files-dataset.md)
* [SVM](machine-learning/datasets/svm-dataset.md)
* Ready to use: * Ready to use:
* [Iris](machine-learning/datasets/demo/iris.md) * [Iris](machine-learning/datasets/demo/iris.md)
* [Wine](machine-learning/datasets/demo/wine.md) * [Wine](machine-learning/datasets/demo/wine.md)
@ -100,7 +101,7 @@ Example scripts are available in a separate repository [php-ai/php-ml-examples](
* [Matrix](math/matrix.md) * [Matrix](math/matrix.md)
* [Set](math/set.md) * [Set](math/set.md)
* [Statistic](math/statistic.md) * [Statistic](math/statistic.md)
## Contribute ## Contribute

View File

@ -0,0 +1,13 @@
# SvmDataset
Helper class that loads data from a file in SVM-Light (or LibSVM) format. It extends the `ArrayDataset`.
### Constructor Parameters
* $filepath - (string) path to the file
```
$dataset = new SvmDataset('dataset.svm');
```
See [ArrayDataset](array-dataset.md) for more information.

View File

@ -38,6 +38,7 @@ pages:
- Array Dataset: machine-learning/datasets/array-dataset.md - Array Dataset: machine-learning/datasets/array-dataset.md
- CSV Dataset: machine-learning/datasets/csv-dataset.md - CSV Dataset: machine-learning/datasets/csv-dataset.md
- Files Dataset: machine-learning/datasets/files-dataset.md - Files Dataset: machine-learning/datasets/files-dataset.md
- SVM Dataset: machine-learning/datasets/svm-dataset.md
- Ready to use datasets: - Ready to use datasets:
- Iris: machine-learning/datasets/demo/iris.md - Iris: machine-learning/datasets/demo/iris.md
- Wine: machine-learning/datasets/demo/wine.md - Wine: machine-learning/datasets/demo/wine.md

130
src/Dataset/SvmDataset.php Normal file
View File

@ -0,0 +1,130 @@
<?php
declare(strict_types=1);
namespace Phpml\Dataset;
use Phpml\Exception\DatasetException;
use Phpml\Exception\FileException;
/**
 * Dataset loaded from a file in SVM-Light (LibSVM) format.
 *
 * Every line is expected to look like "<target> <index>:<value> ...", with an
 * optional trailing comment introduced by "#". Feature indexes are 1-based in
 * the file and are stored 0-based in the samples; features that are absent
 * from a line are filled with 0 so that all samples end up the same width.
 */
class SvmDataset extends ArrayDataset
{
    /**
     * @param string $filePath path to the file to load
     *
     * @throws FileException    when the file does not exist or cannot be opened
     * @throws DatasetException when a target, feature index or feature value is malformed
     */
    public function __construct(string $filePath)
    {
        [$samples, $targets] = self::readProblem($filePath);

        parent::__construct($samples, $targets);
    }

    /**
     * Reads the whole file and returns the pair [samples, targets].
     *
     * @throws FileException
     * @throws DatasetException
     */
    private static function readProblem(string $filePath): array
    {
        $handle = self::openFile($filePath);

        $samples = [];
        $targets = [];
        $maxIndex = 0;

        try {
            while (($line = fgets($handle)) !== false) {
                [$sample, $target, $maxIndex] = self::processLine($line, $maxIndex);
                $samples[] = $sample;
                $targets[] = $target;
            }
        } finally {
            // Release the handle even when a malformed line makes processLine() throw.
            fclose($handle);
        }

        // Earlier lines were sized against a smaller $maxIndex; widen them to the final width.
        foreach ($samples as &$sample) {
            $sample = array_pad($sample, $maxIndex + 1, 0);
        }

        // Break the reference so a later write to $sample cannot alter the last row.
        unset($sample);

        return [$samples, $targets];
    }

    /**
     * Opens the file for binary reading.
     *
     * @return resource
     *
     * @throws FileException when the file does not exist or fopen() fails
     */
    private static function openFile(string $filePath)
    {
        if (!file_exists($filePath)) {
            throw FileException::missingFile(basename($filePath));
        }

        $handle = fopen($filePath, 'rb');
        if ($handle === false) {
            throw FileException::cantOpenFile(basename($filePath));
        }

        return $handle;
    }

    /**
     * Converts one line into [sample, target, maxIndex], where maxIndex is the
     * largest 0-based feature index seen so far (including this line).
     *
     * @throws DatasetException
     */
    private static function processLine(string $line, int $maxIndex): array
    {
        $columns = self::parseLine($line);

        // The first column is the target; every remaining column is an "index:value" feature.
        $target = self::parseTargetColumn($columns[0]);
        $sample = array_fill(0, $maxIndex + 1, 0);

        $n = count($columns);
        for ($i = 1; $i < $n; ++$i) {
            [$index, $value] = self::parseFeatureColumn($columns[$i]);
            if ($index > $maxIndex) {
                $maxIndex = $index;
                $sample = array_pad($sample, $maxIndex + 1, 0);
            }

            $sample[$index] = $value;
        }

        return [$sample, $target, $maxIndex];
    }

    /**
     * Strips the comment and trailing whitespace from a line and splits it into columns.
     *
     * Columns are split on every single space (tabs count as spaces), so leading
     * or repeated separators yield empty columns, which the column parsers reject.
     */
    private static function parseLine(string $line): array
    {
        // Everything from the first "#" onwards is a comment.
        $line = explode('#', $line, 2)[0];
        $line = rtrim($line);
        $line = str_replace("\t", ' ', $line);

        return explode(' ', $line);
    }

    /**
     * @throws DatasetException when the column is not numeric
     */
    private static function parseTargetColumn(string $column): float
    {
        if (!is_numeric($column)) {
            throw DatasetException::invalidTarget($column);
        }

        return (float) $column;
    }

    /**
     * Splits an "index:value" column into [0-based index, value].
     *
     * @throws DatasetException when the column has no ":" or either part is invalid
     */
    private static function parseFeatureColumn(string $column): array
    {
        $feature = explode(':', $column, 2);
        if (count($feature) !== 2) {
            throw DatasetException::invalidValue($column);
        }

        $index = self::parseFeatureIndex($feature[0]);
        $value = self::parseFeatureValue($feature[1]);

        return [$index, $value];
    }

    /**
     * Validates a 1-based feature index and returns it 0-based.
     *
     * @throws DatasetException when the index is not made of digits only or is below 1
     */
    private static function parseFeatureIndex(string $index): int
    {
        // ctype_digit() already rejects the empty string, signs, decimals and exponents.
        if (!ctype_digit($index) || (int) $index < 1) {
            throw DatasetException::invalidIndex($index);
        }

        return (int) $index - 1;
    }

    /**
     * @throws DatasetException when the value is not numeric
     */
    private static function parseFeatureValue(string $value): float
    {
        if (!is_numeric($value)) {
            throw DatasetException::invalidValue($value);
        }

        return (float) $value;
    }
}

View File

@ -12,4 +12,19 @@ class DatasetException extends Exception
{ {
return new self(sprintf('Dataset root folder "%s" missing.', $path)); return new self(sprintf('Dataset root folder "%s" missing.', $path));
} }
/**
 * Creates the exception whose message reports the given target as invalid.
 */
public static function invalidTarget(string $target): self
{
    $message = sprintf('Invalid target "%s".', $target);

    return new self($message);
}
/**
 * Creates the exception whose message reports the given index as invalid.
 */
public static function invalidIndex(string $index): self
{
    $message = sprintf('Invalid index "%s".', $index);

    return new self($message);
}
/**
 * Creates the exception whose message reports the given value as invalid.
 */
public static function invalidValue(string $value): self
{
    $message = sprintf('Invalid value "%s".', $value);

    return new self($message);
}
} }

View File

@ -0,0 +1 @@
0 1:2.3

View File

@ -0,0 +1,3 @@
1 1:2.3
0 1:4.56
1 1:78.9

View File

@ -0,0 +1,3 @@
1 1:2 2:4 3:6 4:8
2 1:3 2:5 3:7 4:9
0 1:1.2 2:3.4 3:5.6 4:7.8

View File

@ -0,0 +1,2 @@
0 1:2 # This is a comment.
1 1:34 # This # is # : # also # a # comment # .

View File

View File

@ -0,0 +1,3 @@
1 1:2.3
0 1:4.56

View File

@ -0,0 +1 @@
0 0:2.3

View File

@ -0,0 +1 @@
0 12345

View File

@ -0,0 +1 @@
0 1:2.3

View File

@ -0,0 +1 @@
0 1:xyz

View File

@ -0,0 +1 @@
1:2.3

View File

@ -0,0 +1 @@
0 x:2.3

View File

@ -0,0 +1 @@
A 1:2.3

View File

@ -0,0 +1,2 @@
0 2:3.45
1 5:6.789

View File

@ -0,0 +1 @@
1 1:23 2:45 # comments

View File

@ -0,0 +1,212 @@
<?php
declare(strict_types=1);
namespace Phpml\Tests\Dataset;
use Phpml\Dataset\SvmDataset;
use Phpml\Exception\DatasetException;
use Phpml\Exception\FileException;
use PHPUnit\Framework\TestCase;
/**
 * Tests for SvmDataset: successful loads from the fixture files under
 * Resources/svm, and the exceptions raised for missing or malformed files.
 */
class SvmDatasetTest extends TestCase
{
    public function testSvmDatasetEmpty(): void
    {
        $this->assertLoadedDataset('empty.svm', [], []);
    }

    public function testSvmDataset1x1(): void
    {
        $this->assertLoadedDataset(
            '1x1.svm',
            [
                [2.3],
            ],
            [
                0,
            ]
        );
    }

    public function testSvmDataset3x1(): void
    {
        $this->assertLoadedDataset(
            '3x1.svm',
            [
                [2.3],
                [4.56],
                [78.9],
            ],
            [
                1,
                0,
                1,
            ]
        );
    }

    public function testSvmDataset3x4(): void
    {
        $this->assertLoadedDataset(
            '3x4.svm',
            [
                [2, 4, 6, 8],
                [3, 5, 7, 9],
                [1.2, 3.4, 5.6, 7.8],
            ],
            [
                1,
                2,
                0,
            ]
        );
    }

    public function testSvmDatasetSparse(): void
    {
        $this->assertLoadedDataset(
            'sparse.svm',
            [
                [0, 3.45, 0, 0, 0],
                [0, 0, 0, 0, 6.789],
            ],
            [
                0,
                1,
            ]
        );
    }

    public function testSvmDatasetComments(): void
    {
        $this->assertLoadedDataset(
            'comments.svm',
            [
                [2],
                [34],
            ],
            [
                0,
                1,
            ]
        );
    }

    public function testSvmDatasetTabs(): void
    {
        $this->assertLoadedDataset(
            'tabs.svm',
            [
                [23, 45],
            ],
            [
                1,
            ]
        );
    }

    public function testSvmDatasetMissingFile(): void
    {
        $this->expectException(FileException::class);

        new SvmDataset(self::getFilePath('err_file_not_exists.svm'));
    }

    public function testSvmDatasetEmptyLine(): void
    {
        $this->expectException(DatasetException::class);

        new SvmDataset(self::getFilePath('err_empty_line.svm'));
    }

    public function testSvmDatasetNoLabels(): void
    {
        $this->expectException(DatasetException::class);

        new SvmDataset(self::getFilePath('err_no_labels.svm'));
    }

    public function testSvmDatasetStringLabels(): void
    {
        $this->expectException(DatasetException::class);

        new SvmDataset(self::getFilePath('err_string_labels.svm'));
    }

    public function testSvmDatasetInvalidSpaces(): void
    {
        $this->expectException(DatasetException::class);

        new SvmDataset(self::getFilePath('err_invalid_spaces.svm'));
    }

    public function testSvmDatasetStringIndex(): void
    {
        $this->expectException(DatasetException::class);

        new SvmDataset(self::getFilePath('err_string_index.svm'));
    }

    public function testSvmDatasetIndexZero(): void
    {
        $this->expectException(DatasetException::class);

        new SvmDataset(self::getFilePath('err_index_zero.svm'));
    }

    public function testSvmDatasetInvalidValue(): void
    {
        $this->expectException(DatasetException::class);

        new SvmDataset(self::getFilePath('err_invalid_value.svm'));
    }

    public function testSvmDatasetInvalidFeature(): void
    {
        $this->expectException(DatasetException::class);

        new SvmDataset(self::getFilePath('err_invalid_feature.svm'));
    }

    /**
     * Loads the named fixture and compares its samples and targets with the expectations.
     *
     * assertEquals (not assertSame) is deliberate: the expectations above are
     * written with integer literals and must compare loosely against the
     * loaded values.
     */
    private function assertLoadedDataset(string $baseName, array $expectedSamples, array $expectedTargets): void
    {
        $dataset = new SvmDataset(self::getFilePath($baseName));

        $this->assertEquals($expectedSamples, $dataset->getSamples());
        $this->assertEquals($expectedTargets, $dataset->getTargets());
    }

    /**
     * Resolves a fixture file name to its path under this test's Resources/svm directory.
     */
    private static function getFilePath(string $baseName): string
    {
        return __DIR__.'/Resources/svm/'.$baseName;
    }
}