Add a SvmDataset class for SVM-Light (or LibSVM) format files (#237)
* Add data loader for svm format * Add tests for error cases * Set proper exception messages * Add documents * Add error checking code for invalid column format * Add missing documents
This commit is contained in:
parent
a96f03e8dd
commit
4562f1dfc9
|
@ -104,6 +104,7 @@ Example scripts are available in a separate repository [php-ai/php-ml-examples](
|
||||||
* [Array](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/array-dataset/)
|
* [Array](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/array-dataset/)
|
||||||
* [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset/)
|
* [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset/)
|
||||||
* [Files](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/files-dataset/)
|
* [Files](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/files-dataset/)
|
||||||
|
* [SVM](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/svm-dataset/)
|
||||||
* Ready to use:
|
* Ready to use:
|
||||||
* [Iris](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/iris/)
|
* [Iris](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/iris/)
|
||||||
* [Wine](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/wine/)
|
* [Wine](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/wine/)
|
||||||
|
|
|
@ -12,7 +12,7 @@
|
||||||
<a href="http://www.yegor256.com/2016/10/23/award-2017.html">
|
<a href="http://www.yegor256.com/2016/10/23/award-2017.html">
|
||||||
<img src="http://www.yegor256.com/images/award/2017/winner-itcraftsmanpl.png"
|
<img src="http://www.yegor256.com/images/award/2017/winner-itcraftsmanpl.png"
|
||||||
style="width:203px;height:45px;"/></a>
|
style="width:203px;height:45px;"/></a>
|
||||||
|
|
||||||
![PHP-ML - Machine Learning library for PHP](assets/php-ml-logo.png)
|
![PHP-ML - Machine Learning library for PHP](assets/php-ml-logo.png)
|
||||||
|
|
||||||
Fresh approach to Machine Learning in PHP. Algorithms, Cross Validation, Neural Network, Preprocessing, Feature Extraction and much more in one library.
|
Fresh approach to Machine Learning in PHP. Algorithms, Cross Validation, Neural Network, Preprocessing, Feature Extraction and much more in one library.
|
||||||
|
@ -31,7 +31,7 @@ $labels = ['a', 'a', 'a', 'b', 'b', 'b'];
|
||||||
$classifier = new KNearestNeighbors();
|
$classifier = new KNearestNeighbors();
|
||||||
$classifier->train($samples, $labels);
|
$classifier->train($samples, $labels);
|
||||||
|
|
||||||
$classifier->predict([3, 2]);
|
$classifier->predict([3, 2]);
|
||||||
// return 'b'
|
// return 'b'
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -89,6 +89,7 @@ Example scripts are available in a separate repository [php-ai/php-ml-examples](
|
||||||
* [Array](machine-learning/datasets/array-dataset.md)
|
* [Array](machine-learning/datasets/array-dataset.md)
|
||||||
* [CSV](machine-learning/datasets/csv-dataset.md)
|
* [CSV](machine-learning/datasets/csv-dataset.md)
|
||||||
* [Files](machine-learning/datasets/files-dataset.md)
|
* [Files](machine-learning/datasets/files-dataset.md)
|
||||||
|
* [SVM](machine-learning/datasets/svm-dataset.md)
|
||||||
* Ready to use:
|
* Ready to use:
|
||||||
* [Iris](machine-learning/datasets/demo/iris.md)
|
* [Iris](machine-learning/datasets/demo/iris.md)
|
||||||
* [Wine](machine-learning/datasets/demo/wine.md)
|
* [Wine](machine-learning/datasets/demo/wine.md)
|
||||||
|
@ -100,7 +101,7 @@ Example scripts are available in a separate repository [php-ai/php-ml-examples](
|
||||||
* [Matrix](math/matrix.md)
|
* [Matrix](math/matrix.md)
|
||||||
* [Set](math/set.md)
|
* [Set](math/set.md)
|
||||||
* [Statistic](math/statistic.md)
|
* [Statistic](math/statistic.md)
|
||||||
|
|
||||||
|
|
||||||
## Contribute
|
## Contribute
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,13 @@
|
||||||
|
# SvmDataset
|
||||||
|
|
||||||
|
Helper class that loads data from a file in SVM-Light (or LibSVM) format. It extends the `ArrayDataset`.
|
||||||
|
|
||||||
|
### Constructor Parameters
|
||||||
|
|
||||||
|
* $filePath - (string) path to the file
|
||||||
|
|
||||||
|
```
|
||||||
|
$dataset = new SvmDataset('dataset.svm');
|
||||||
|
```
|
||||||
|
|
||||||
|
See [ArrayDataset](array-dataset.md) for more information.
|
|
@ -38,6 +38,7 @@ pages:
|
||||||
- Array Dataset: machine-learning/datasets/array-dataset.md
|
- Array Dataset: machine-learning/datasets/array-dataset.md
|
||||||
- CSV Dataset: machine-learning/datasets/csv-dataset.md
|
- CSV Dataset: machine-learning/datasets/csv-dataset.md
|
||||||
- Files Dataset: machine-learning/datasets/files-dataset.md
|
- Files Dataset: machine-learning/datasets/files-dataset.md
|
||||||
|
- SVM Dataset: machine-learning/datasets/svm-dataset.md
|
||||||
- Ready to use datasets:
|
- Ready to use datasets:
|
||||||
- Iris: machine-learning/datasets/demo/iris.md
|
- Iris: machine-learning/datasets/demo/iris.md
|
||||||
- Wine: machine-learning/datasets/demo/wine.md
|
- Wine: machine-learning/datasets/demo/wine.md
|
||||||
|
|
|
@ -0,0 +1,130 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
declare(strict_types=1);
|
||||||
|
|
||||||
|
namespace Phpml\Dataset;
|
||||||
|
|
||||||
|
use Phpml\Exception\DatasetException;
|
||||||
|
use Phpml\Exception\FileException;
|
||||||
|
|
||||||
|
/**
 * Dataset loaded from a text file where every line has the form
 * "<target> <index>:<value> <index>:<value> ... # optional comment".
 *
 * Feature indexes in the file are 1-based; they are stored 0-based in the
 * resulting samples. Features that are absent from a line are filled with 0,
 * and every sample is padded to the width of the largest index seen in the file.
 */
class SvmDataset extends ArrayDataset
{
    /**
     * @param string $filePath path to the file to load
     *
     * @throws FileException    when the file is missing or cannot be opened
     * @throws DatasetException when a line contains an invalid target, index or value
     */
    public function __construct(string $filePath)
    {
        [$samples, $targets] = self::readProblem($filePath);

        parent::__construct($samples, $targets);
    }

    /**
     * Reads the whole file and returns [samples, targets].
     *
     * @return array two-element list: the padded samples and the matching targets
     *
     * @throws FileException
     * @throws DatasetException
     */
    private static function readProblem(string $filePath): array
    {
        $handle = self::openFile($filePath);

        $samples = [];
        $targets = [];
        $maxIndex = 0;

        // Close the handle even when a malformed line makes processLine() throw.
        try {
            while (($line = fgets($handle)) !== false) {
                [$sample, $target, $maxIndex] = self::processLine($line, $maxIndex);
                $samples[] = $sample;
                $targets[] = $target;
            }
        } finally {
            fclose($handle);
        }

        // Earlier lines were sized with the maximum index known at that time;
        // widen them now that the final width is known. Writing through the key
        // avoids leaving a dangling by-reference loop variable behind.
        $width = $maxIndex + 1;
        foreach ($samples as $position => $row) {
            $samples[$position] = array_pad($row, $width, 0);
        }

        return [$samples, $targets];
    }

    /**
     * Opens the file for binary reading.
     *
     * @return resource
     *
     * @throws FileException when the path does not exist or fopen() fails
     */
    private static function openFile(string $filePath)
    {
        if (!file_exists($filePath)) {
            throw FileException::missingFile(basename($filePath));
        }

        $handle = fopen($filePath, 'rb');
        if ($handle === false) {
            throw FileException::cantOpenFile(basename($filePath));
        }

        return $handle;
    }

    /**
     * Converts one line of the file into a dense sample and its target.
     *
     * @param string $line     raw line as returned by fgets()
     * @param int    $maxIndex largest 0-based feature index seen so far
     *
     * @return array three-element list: sample, target, updated max index
     *
     * @throws DatasetException
     */
    private static function processLine(string $line, int $maxIndex): array
    {
        $columns = self::parseLine($line);

        // The first column is the target; an empty line yields [''] and is rejected here.
        $target = self::parseTargetColumn($columns[0]);
        $sample = array_fill(0, $maxIndex + 1, 0);

        $n = count($columns);
        for ($i = 1; $i < $n; ++$i) {
            [$index, $value] = self::parseFeatureColumn($columns[$i]);

            if ($index > $maxIndex) {
                // Grow the sample so that $index becomes a valid, zero-filled slot.
                $maxIndex = $index;
                $sample = array_pad($sample, $maxIndex + 1, 0);
            }

            $sample[$index] = $value;
        }

        return [$sample, $target, $maxIndex];
    }

    /**
     * Splits a raw line into columns.
     *
     * Everything from the first '#' onwards is dropped, trailing whitespace is
     * removed and tabs are treated as spaces. The split is on single spaces, so
     * consecutive separators produce empty columns, which later fail validation.
     *
     * @return string[]
     */
    private static function parseLine(string $line): array
    {
        $line = explode('#', $line, 2)[0];
        $line = rtrim($line);
        $line = str_replace("\t", ' ', $line);

        return explode(' ', $line);
    }

    /**
     * @throws DatasetException when the column is not numeric
     */
    private static function parseTargetColumn(string $column): float
    {
        if (!is_numeric($column)) {
            throw DatasetException::invalidTarget($column);
        }

        return (float) $column;
    }

    /**
     * Parses an "<index>:<value>" column.
     *
     * @return array two-element list: 0-based index (int) and value (float)
     *
     * @throws DatasetException when the column has no ':' separator or either part is invalid
     */
    private static function parseFeatureColumn(string $column): array
    {
        $feature = explode(':', $column, 2);
        if (count($feature) !== 2) {
            throw DatasetException::invalidValue($column);
        }

        $index = self::parseFeatureIndex($feature[0]);
        $value = self::parseFeatureValue($feature[1]);

        return [$index, $value];
    }

    /**
     * Validates a 1-based feature index and converts it to a 0-based one.
     *
     * @throws DatasetException when the index is not a string of decimal digits or is below 1
     */
    private static function parseFeatureIndex(string $index): int
    {
        // ctype_digit() already rejects the empty string, signs, decimals and
        // exponents, so a separate is_numeric() check adds nothing.
        if (!ctype_digit($index) || (int) $index < 1) {
            throw DatasetException::invalidIndex($index);
        }

        return (int) $index - 1;
    }

    /**
     * @throws DatasetException when the value is not numeric
     */
    private static function parseFeatureValue(string $value): float
    {
        if (!is_numeric($value)) {
            throw DatasetException::invalidValue($value);
        }

        return (float) $value;
    }
}
|
|
@ -12,4 +12,19 @@ class DatasetException extends Exception
|
||||||
{
|
{
|
||||||
return new self(sprintf('Dataset root folder "%s" missing.', $path));
|
return new self(sprintf('Dataset root folder "%s" missing.', $path));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Creates an exception whose message quotes the rejected target string.
 */
public static function invalidTarget(string $target): self
{
    $message = sprintf('Invalid target "%s".', $target);

    return new self($message);
}
|
||||||
|
|
||||||
|
/**
 * Creates an exception whose message quotes the rejected index string.
 */
public static function invalidIndex(string $index): self
{
    $message = sprintf('Invalid index "%s".', $index);

    return new self($message);
}
|
||||||
|
|
||||||
|
/**
 * Creates an exception whose message quotes the rejected value string.
 */
public static function invalidValue(string $value): self
{
    $message = sprintf('Invalid value "%s".', $value);

    return new self($message);
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
0 1:2.3
|
|
@ -0,0 +1,3 @@
|
||||||
|
1 1:2.3
|
||||||
|
0 1:4.56
|
||||||
|
1 1:78.9
|
|
@ -0,0 +1,3 @@
|
||||||
|
1 1:2 2:4 3:6 4:8
|
||||||
|
2 1:3 2:5 3:7 4:9
|
||||||
|
0 1:1.2 2:3.4 3:5.6 4:7.8
|
|
@ -0,0 +1,2 @@
|
||||||
|
0 1:2 # This is a comment.
|
||||||
|
1 1:34 # This # is # : # also # a # comment # .
|
|
@ -0,0 +1,3 @@
|
||||||
|
1 1:2.3
|
||||||
|
|
||||||
|
0 1:4.56
|
|
@ -0,0 +1 @@
|
||||||
|
0 0:2.3
|
|
@ -0,0 +1 @@
|
||||||
|
0 12345
|
|
@ -0,0 +1 @@
|
||||||
|
0 1:2.3
|
|
@ -0,0 +1 @@
|
||||||
|
0 1:xyz
|
|
@ -0,0 +1 @@
|
||||||
|
1:2.3
|
|
@ -0,0 +1 @@
|
||||||
|
0 x:2.3
|
|
@ -0,0 +1 @@
|
||||||
|
A 1:2.3
|
|
@ -0,0 +1,2 @@
|
||||||
|
0 2:3.45
|
||||||
|
1 5:6.789
|
|
@ -0,0 +1 @@
|
||||||
|
1 1:23 2:45 # comments
|
|
@ -0,0 +1,212 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
declare(strict_types=1);
|
||||||
|
|
||||||
|
namespace Phpml\Tests\Dataset;
|
||||||
|
|
||||||
|
use Phpml\Dataset\SvmDataset;
|
||||||
|
use Phpml\Exception\DatasetException;
|
||||||
|
use Phpml\Exception\FileException;
|
||||||
|
use PHPUnit\Framework\TestCase;
|
||||||
|
|
||||||
|
/**
 * Exercises SvmDataset against the fixture files under Resources/svm.
 *
 * Success cases compare the loaded samples and targets with literal
 * expectations; failure cases only check the class of the thrown exception.
 */
class SvmDatasetTest extends TestCase
{
    public function testSvmDatasetEmpty(): void
    {
        $this->assertDatasetContents('empty.svm', [], []);
    }

    public function testSvmDataset1x1(): void
    {
        $this->assertDatasetContents('1x1.svm', [[2.3]], [0]);
    }

    public function testSvmDataset3x1(): void
    {
        $this->assertDatasetContents(
            '3x1.svm',
            [
                [2.3],
                [4.56],
                [78.9],
            ],
            [1, 0, 1]
        );
    }

    public function testSvmDataset3x4(): void
    {
        $this->assertDatasetContents(
            '3x4.svm',
            [
                [2, 4, 6, 8],
                [3, 5, 7, 9],
                [1.2, 3.4, 5.6, 7.8],
            ],
            [1, 2, 0]
        );
    }

    public function testSvmDatasetSparse(): void
    {
        $this->assertDatasetContents(
            'sparse.svm',
            [
                [0, 3.45, 0, 0, 0],
                [0, 0, 0, 0, 6.789],
            ],
            [0, 1]
        );
    }

    public function testSvmDatasetComments(): void
    {
        $this->assertDatasetContents('comments.svm', [[2], [34]], [0, 1]);
    }

    public function testSvmDatasetTabs(): void
    {
        $this->assertDatasetContents('tabs.svm', [[23, 45]], [1]);
    }

    public function testSvmDatasetMissingFile(): void
    {
        $this->assertLoadingThrows(FileException::class, 'err_file_not_exists.svm');
    }

    public function testSvmDatasetEmptyLine(): void
    {
        $this->assertLoadingThrows(DatasetException::class, 'err_empty_line.svm');
    }

    public function testSvmDatasetNoLabels(): void
    {
        $this->assertLoadingThrows(DatasetException::class, 'err_no_labels.svm');
    }

    public function testSvmDatasetStringLabels(): void
    {
        $this->assertLoadingThrows(DatasetException::class, 'err_string_labels.svm');
    }

    public function testSvmDatasetInvalidSpaces(): void
    {
        $this->assertLoadingThrows(DatasetException::class, 'err_invalid_spaces.svm');
    }

    public function testSvmDatasetStringIndex(): void
    {
        $this->assertLoadingThrows(DatasetException::class, 'err_string_index.svm');
    }

    public function testSvmDatasetIndexZero(): void
    {
        $this->assertLoadingThrows(DatasetException::class, 'err_index_zero.svm');
    }

    public function testSvmDatasetInvalidValue(): void
    {
        $this->assertLoadingThrows(DatasetException::class, 'err_invalid_value.svm');
    }

    public function testSvmDatasetInvalidFeature(): void
    {
        $this->assertLoadingThrows(DatasetException::class, 'err_invalid_feature.svm');
    }

    /**
     * Loads the named fixture and compares its samples and targets
     * (with assertEquals, i.e. loosely, so 0 matches 0.0) to the expectations.
     */
    private function assertDatasetContents(string $baseName, array $expectedSamples, array $expectedTargets): void
    {
        $dataset = new SvmDataset(self::getFilePath($baseName));

        self::assertEquals($expectedSamples, $dataset->getSamples());
        self::assertEquals($expectedTargets, $dataset->getTargets());
    }

    /**
     * Registers the expected exception class, then attempts to load the named fixture.
     */
    private function assertLoadingThrows(string $exceptionClass, string $baseName): void
    {
        $this->expectException($exceptionClass);

        new SvmDataset(self::getFilePath($baseName));
    }

    /**
     * Resolves a fixture file name relative to this test's directory.
     */
    private static function getFilePath(string $baseName): string
    {
        return __DIR__.'/Resources/svm/'.$baseName;
    }
}
|
Loading…
Reference in New Issue