Add a SvmDataset class for SVM-Light (or LibSVM) format files (#237)

* Add data loader for svm format

* Add tests for error cases

* Set proper exception messages

* Add documents

* Add error checking code for invalid column format

* Add missing documents
This commit is contained in:
Yuji Uchiyama 2018-02-24 19:17:35 +09:00 committed by Arkadiusz Kondas
parent a96f03e8dd
commit 4562f1dfc9
22 changed files with 398 additions and 3 deletions

View File

@ -104,6 +104,7 @@ Example scripts are available in a separate repository [php-ai/php-ml-examples](
* [Array](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/array-dataset/)
* [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset/)
* [Files](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/files-dataset/)
* [SVM](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/svm-dataset/)
* Ready to use:
* [Iris](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/iris/)
* [Wine](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/wine/)

View File

@ -12,7 +12,7 @@
<a href="http://www.yegor256.com/2016/10/23/award-2017.html">
<img src="http://www.yegor256.com/images/award/2017/winner-itcraftsmanpl.png"
style="width:203px;height:45px;"/></a>
![PHP-ML - Machine Learning library for PHP](assets/php-ml-logo.png)
Fresh approach to Machine Learning in PHP. Algorithms, Cross Validation, Neural Network, Preprocessing, Feature Extraction and much more in one library.
@ -31,7 +31,7 @@ $labels = ['a', 'a', 'a', 'b', 'b', 'b'];
$classifier = new KNearestNeighbors();
$classifier->train($samples, $labels);
$classifier->predict([3, 2]);
$classifier->predict([3, 2]);
// return 'b'
```
@ -89,6 +89,7 @@ Example scripts are available in a separate repository [php-ai/php-ml-examples](
* [Array](machine-learning/datasets/array-dataset.md)
* [CSV](machine-learning/datasets/csv-dataset.md)
* [Files](machine-learning/datasets/files-dataset.md)
* [SVM](machine-learning/datasets/svm-dataset.md)
* Ready to use:
* [Iris](machine-learning/datasets/demo/iris.md)
* [Wine](machine-learning/datasets/demo/wine.md)
@ -100,7 +101,7 @@ Example scripts are available in a separate repository [php-ai/php-ml-examples](
* [Matrix](math/matrix.md)
* [Set](math/set.md)
* [Statistic](math/statistic.md)
## Contribute

View File

@ -0,0 +1,13 @@
# SvmDataset
Helper class that loads data from a file in SVM-Light (or LibSVM) format. It extends the `ArrayDataset`.
### Constructor Parameters
* $filePath - (string) path to the file
```
$dataset = new SvmDataset('dataset.svm');
```
See [ArrayDataset](array-dataset.md) for more information.

View File

@ -38,6 +38,7 @@ pages:
- Array Dataset: machine-learning/datasets/array-dataset.md
- CSV Dataset: machine-learning/datasets/csv-dataset.md
- Files Dataset: machine-learning/datasets/files-dataset.md
- SVM Dataset: machine-learning/datasets/svm-dataset.md
- Ready to use datasets:
- Iris: machine-learning/datasets/demo/iris.md
- Wine: machine-learning/datasets/demo/wine.md

130
src/Dataset/SvmDataset.php Normal file
View File

@ -0,0 +1,130 @@
<?php
declare(strict_types=1);
namespace Phpml\Dataset;
use Phpml\Exception\DatasetException;
use Phpml\Exception\FileException;
/**
 * Dataset read from a text file in SVM-Light / LibSVM sparse format.
 *
 * Every line has the shape `<target> <index>:<value> <index>:<value> ...`:
 *  - columns are separated by a single space; tabs are treated as spaces,
 *  - everything from the first `#` to the end of the line is ignored,
 *  - `<target>` and `<value>` must be numeric and are stored as floats,
 *  - `<index>` is a positive, 1-based integer; it is stored 0-based,
 *  - features that a line does not mention are stored as 0.
 *
 * Parsing is strict: an empty line, or an empty column produced by leading or
 * repeated separators, is rejected with a DatasetException.
 */
class SvmDataset extends ArrayDataset
{
    /**
     * Reads the whole file and hands the samples and targets to the parent constructor.
     *
     * @param string $filePath Path to the SVM-Light formatted file.
     *
     * @throws FileException    When the file is missing or cannot be opened.
     * @throws DatasetException When a line contains an invalid target, index or value.
     */
    public function __construct(string $filePath)
    {
        [$samples, $targets] = self::readProblem($filePath);

        parent::__construct($samples, $targets);
    }

    /**
     * Parses every line of the file into a dense sample row and a target.
     *
     * @return array Two-element list: [samples, targets]. All sample rows have the same width.
     *
     * @throws FileException
     * @throws DatasetException
     */
    private static function readProblem(string $filePath): array
    {
        $handle = self::openFile($filePath);

        $samples = [];
        $targets = [];
        $maxIndex = 0;

        try {
            while (($line = fgets($handle)) !== false) {
                [$sample, $target, $maxIndex] = self::processLine($line, $maxIndex);
                $samples[] = $sample;
                $targets[] = $target;
            }
        } finally {
            // Release the handle even when a malformed line aborts parsing.
            fclose($handle);
        }

        // Rows parsed before a larger index was seen are narrower than later
        // ones; widen them all to the final width. Assigning by key avoids
        // leaving a dangling reference variable behind.
        $width = $maxIndex + 1;
        foreach ($samples as $position => $row) {
            $samples[$position] = array_pad($row, $width, 0);
        }

        return [$samples, $targets];
    }

    /**
     * Opens the file for binary reading.
     *
     * @return resource Open stream; the caller is responsible for closing it.
     *
     * @throws FileException When the path does not exist or fopen() fails.
     */
    private static function openFile(string $filePath)
    {
        if (!file_exists($filePath)) {
            throw FileException::missingFile(basename($filePath));
        }

        $handle = fopen($filePath, 'rb');
        if ($handle === false) {
            throw FileException::cantOpenFile(basename($filePath));
        }

        return $handle;
    }

    /**
     * Converts one raw line into a dense sample row and its target.
     *
     * @param string $line     Raw line as returned by fgets().
     * @param int    $maxIndex Largest 0-based feature index seen so far.
     *
     * @return array Three-element list: [sample, target, updated $maxIndex].
     *
     * @throws DatasetException
     */
    private static function processLine(string $line, int $maxIndex): array
    {
        $columns = self::parseLine($line);

        $target = self::parseTargetColumn($columns[0]);

        // Start with zeros for every feature index known so far.
        $sample = array_fill(0, $maxIndex + 1, 0);

        $n = count($columns);
        for ($i = 1; $i < $n; ++$i) {
            [$index, $value] = self::parseFeatureColumn($columns[$i]);
            if ($index > $maxIndex) {
                // A new highest index widens this row; earlier rows are
                // widened later by readProblem().
                $maxIndex = $index;
                $sample = array_pad($sample, $maxIndex + 1, 0);
            }

            $sample[$index] = $value;
        }

        return [$sample, $target, $maxIndex];
    }

    /**
     * Strips the trailing comment and whitespace, then splits the line into columns.
     *
     * @return array List of column strings; a blank line yields a single empty string.
     */
    private static function parseLine(string $line): array
    {
        $line = explode('#', $line, 2)[0];
        $line = rtrim($line);
        $line = str_replace("\t", ' ', $line);

        return explode(' ', $line);
    }

    /**
     * Validates and converts the target (first) column.
     *
     * @throws DatasetException When the column is not numeric.
     */
    private static function parseTargetColumn(string $column): float
    {
        if (!is_numeric($column)) {
            throw DatasetException::invalidTarget($column);
        }

        return (float) $column;
    }

    /**
     * Splits an `index:value` column and validates both parts.
     *
     * @return array Two-element list: [0-based index, float value].
     *
     * @throws DatasetException When the `:` separator is missing or a part is invalid.
     */
    private static function parseFeatureColumn(string $column): array
    {
        $feature = explode(':', $column, 2);
        if (count($feature) !== 2) {
            throw DatasetException::invalidValue($column);
        }

        $index = self::parseFeatureIndex($feature[0]);
        $value = self::parseFeatureValue($feature[1]);

        return [$index, $value];
    }

    /**
     * Converts a 1-based index string into a 0-based integer index.
     *
     * @throws DatasetException When the text is not made of digits only or is less than 1.
     */
    private static function parseFeatureIndex(string $index): int
    {
        // ctype_digit() already rejects empty, signed, fractional and
        // non-numeric text, so no separate is_numeric() check is needed.
        if (!ctype_digit($index) || (int) $index < 1) {
            throw DatasetException::invalidIndex($index);
        }

        return (int) $index - 1;
    }

    /**
     * Validates and converts a feature value.
     *
     * @throws DatasetException When the text is not numeric.
     */
    private static function parseFeatureValue(string $value): float
    {
        if (!is_numeric($value)) {
            throw DatasetException::invalidValue($value);
        }

        return (float) $value;
    }
}

View File

@ -12,4 +12,19 @@ class DatasetException extends Exception
{
return new self(sprintf('Dataset root folder "%s" missing.', $path));
}
/**
 * Creates the exception describing an unusable target column.
 *
 * @param string $target Offending text, quoted verbatim in the message.
 */
public static function invalidTarget(string $target): self
{
    $message = sprintf('Invalid target "%s".', $target);

    return new self($message);
}
/**
 * Creates the exception describing an unusable feature index.
 *
 * @param string $index Offending text, quoted verbatim in the message.
 */
public static function invalidIndex(string $index): self
{
    $message = sprintf('Invalid index "%s".', $index);

    return new self($message);
}
/**
 * Creates the exception describing an unusable value.
 *
 * @param string $value Offending text, quoted verbatim in the message.
 */
public static function invalidValue(string $value): self
{
    $message = sprintf('Invalid value "%s".', $value);

    return new self($message);
}
}

View File

@ -0,0 +1 @@
0 1:2.3

View File

@ -0,0 +1,3 @@
1 1:2.3
0 1:4.56
1 1:78.9

View File

@ -0,0 +1,3 @@
1 1:2 2:4 3:6 4:8
2 1:3 2:5 3:7 4:9
0 1:1.2 2:3.4 3:5.6 4:7.8

View File

@ -0,0 +1,2 @@
0 1:2 # This is a comment.
1 1:34 # This # is # : # also # a # comment # .

View File

View File

@ -0,0 +1,3 @@
1 1:2.3
0 1:4.56

View File

@ -0,0 +1 @@
0 0:2.3

View File

@ -0,0 +1 @@
0 12345

View File

@ -0,0 +1 @@
0 1:2.3

View File

@ -0,0 +1 @@
0 1:xyz

View File

@ -0,0 +1 @@
1:2.3

View File

@ -0,0 +1 @@
0 x:2.3

View File

@ -0,0 +1 @@
A 1:2.3

View File

@ -0,0 +1,2 @@
0 2:3.45
1 5:6.789

View File

@ -0,0 +1 @@
1 1:23 2:45 # comments

View File

@ -0,0 +1,212 @@
<?php
declare(strict_types=1);
namespace Phpml\Tests\Dataset;
use Phpml\Dataset\SvmDataset;
use Phpml\Exception\DatasetException;
use Phpml\Exception\FileException;
use PHPUnit\Framework\TestCase;
/**
 * Exercises SvmDataset against the fixture files under Resources/svm:
 * files that must load into known samples/targets, and files (or a missing
 * path) that must be rejected with an exception.
 */
class SvmDatasetTest extends TestCase
{
    public function testSvmDatasetEmpty(): void
    {
        $this->assertLoads('empty.svm', [], []);
    }

    public function testSvmDataset1x1(): void
    {
        $this->assertLoads(
            '1x1.svm',
            [
                [2.3],
            ],
            [
                0,
            ]
        );
    }

    public function testSvmDataset3x1(): void
    {
        $this->assertLoads(
            '3x1.svm',
            [
                [2.3],
                [4.56],
                [78.9],
            ],
            [
                1,
                0,
                1,
            ]
        );
    }

    public function testSvmDataset3x4(): void
    {
        $this->assertLoads(
            '3x4.svm',
            [
                [2, 4, 6, 8],
                [3, 5, 7, 9],
                [1.2, 3.4, 5.6, 7.8],
            ],
            [
                1,
                2,
                0,
            ]
        );
    }

    public function testSvmDatasetSparse(): void
    {
        $this->assertLoads(
            'sparse.svm',
            [
                [0, 3.45, 0, 0, 0],
                [0, 0, 0, 0, 6.789],
            ],
            [
                0,
                1,
            ]
        );
    }

    public function testSvmDatasetComments(): void
    {
        $this->assertLoads(
            'comments.svm',
            [
                [2],
                [34],
            ],
            [
                0,
                1,
            ]
        );
    }

    public function testSvmDatasetTabs(): void
    {
        $this->assertLoads(
            'tabs.svm',
            [
                [23, 45],
            ],
            [
                1,
            ]
        );
    }

    public function testSvmDatasetMissingFile(): void
    {
        $this->assertRejected(FileException::class, 'err_file_not_exists.svm');
    }

    public function testSvmDatasetEmptyLine(): void
    {
        $this->assertRejected(DatasetException::class, 'err_empty_line.svm');
    }

    public function testSvmDatasetNoLabels(): void
    {
        $this->assertRejected(DatasetException::class, 'err_no_labels.svm');
    }

    public function testSvmDatasetStringLabels(): void
    {
        $this->assertRejected(DatasetException::class, 'err_string_labels.svm');
    }

    public function testSvmDatasetInvalidSpaces(): void
    {
        $this->assertRejected(DatasetException::class, 'err_invalid_spaces.svm');
    }

    public function testSvmDatasetStringIndex(): void
    {
        $this->assertRejected(DatasetException::class, 'err_string_index.svm');
    }

    public function testSvmDatasetIndexZero(): void
    {
        $this->assertRejected(DatasetException::class, 'err_index_zero.svm');
    }

    public function testSvmDatasetInvalidValue(): void
    {
        $this->assertRejected(DatasetException::class, 'err_invalid_value.svm');
    }

    public function testSvmDatasetInvalidFeature(): void
    {
        $this->assertRejected(DatasetException::class, 'err_invalid_feature.svm');
    }

    /**
     * Loads the named fixture and compares its samples, then its targets,
     * with the expected arrays (loose equality, as assertEquals does).
     */
    private function assertLoads(string $baseName, array $expectedSamples, array $expectedTargets): void
    {
        $loaded = new SvmDataset(self::getFilePath($baseName));

        $this->assertEquals($expectedSamples, $loaded->getSamples());
        $this->assertEquals($expectedTargets, $loaded->getTargets());
    }

    /**
     * Registers the expected exception class, then attempts to load the named fixture.
     */
    private function assertRejected(string $exceptionClass, string $baseName): void
    {
        $this->expectException($exceptionClass);

        new SvmDataset(self::getFilePath($baseName));
    }

    /**
     * Resolves a fixture base name to its path next to this test file.
     */
    private static function getFilePath(string $baseName): string
    {
        return __DIR__.'/Resources/svm/'.$baseName;
    }
}