mirror of
https://github.com/Llewellynvdm/php-ml.git
synced 2024-11-24 13:57:33 +00:00
Add a SvmDataset class for SVM-Light (or LibSVM) format files (#237)
* Add data loader for svm format * Add tests for error cases * Set proper exception messages * Add documents * Add error checking code for invalid column format * Add missing documents
This commit is contained in:
parent
a96f03e8dd
commit
4562f1dfc9
@ -104,6 +104,7 @@ Example scripts are available in a separate repository [php-ai/php-ml-examples](
|
||||
* [Array](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/array-dataset/)
|
||||
* [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset/)
|
||||
* [Files](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/files-dataset/)
|
||||
* [SVM](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/svm-dataset/)
|
||||
* Ready to use:
|
||||
* [Iris](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/iris/)
|
||||
* [Wine](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/wine/)
|
||||
|
@ -12,7 +12,7 @@
|
||||
<a href="http://www.yegor256.com/2016/10/23/award-2017.html">
|
||||
<img src="http://www.yegor256.com/images/award/2017/winner-itcraftsmanpl.png"
|
||||
style="width:203px;height:45px;"/></a>
|
||||
|
||||
|
||||
![PHP-ML - Machine Learning library for PHP](assets/php-ml-logo.png)
|
||||
|
||||
Fresh approach to Machine Learning in PHP. Algorithms, Cross Validation, Neural Network, Preprocessing, Feature Extraction and much more in one library.
|
||||
@ -31,7 +31,7 @@ $labels = ['a', 'a', 'a', 'b', 'b', 'b'];
|
||||
$classifier = new KNearestNeighbors();
|
||||
$classifier->train($samples, $labels);
|
||||
|
||||
$classifier->predict([3, 2]);
|
||||
$classifier->predict([3, 2]);
|
||||
// return 'b'
|
||||
```
|
||||
|
||||
@ -89,6 +89,7 @@ Example scripts are available in a separate repository [php-ai/php-ml-examples](
|
||||
* [Array](machine-learning/datasets/array-dataset.md)
|
||||
* [CSV](machine-learning/datasets/csv-dataset.md)
|
||||
* [Files](machine-learning/datasets/files-dataset.md)
|
||||
* [SVM](machine-learning/datasets/svm-dataset.md)
|
||||
* Ready to use:
|
||||
* [Iris](machine-learning/datasets/demo/iris.md)
|
||||
* [Wine](machine-learning/datasets/demo/wine.md)
|
||||
@ -100,7 +101,7 @@ Example scripts are available in a separate repository [php-ai/php-ml-examples](
|
||||
* [Matrix](math/matrix.md)
|
||||
* [Set](math/set.md)
|
||||
* [Statistic](math/statistic.md)
|
||||
|
||||
|
||||
|
||||
## Contribute
|
||||
|
||||
|
13
docs/machine-learning/datasets/svm-dataset.md
Normal file
13
docs/machine-learning/datasets/svm-dataset.md
Normal file
@ -0,0 +1,13 @@
|
||||
# SvmDataset
|
||||
|
||||
Helper class that loads data from a file in SVM-Light (or LibSVM) format. It extends `ArrayDataset`.
|
||||
|
||||
### Constructor Parameters
|
||||
|
||||
* $filePath - (string) path to the file
|
||||
|
||||
```
|
||||
$dataset = new SvmDataset('dataset.svm');
|
||||
```
|
||||
|
||||
See [ArrayDataset](array-dataset.md) for more information.
|
@ -38,6 +38,7 @@ pages:
|
||||
- Array Dataset: machine-learning/datasets/array-dataset.md
|
||||
- CSV Dataset: machine-learning/datasets/csv-dataset.md
|
||||
- Files Dataset: machine-learning/datasets/files-dataset.md
|
||||
- SVM Dataset: machine-learning/datasets/svm-dataset.md
|
||||
- Ready to use datasets:
|
||||
- Iris: machine-learning/datasets/demo/iris.md
|
||||
- Wine: machine-learning/datasets/demo/wine.md
|
||||
|
130
src/Dataset/SvmDataset.php
Normal file
130
src/Dataset/SvmDataset.php
Normal file
@ -0,0 +1,130 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Phpml\Dataset;
|
||||
|
||||
use Phpml\Exception\DatasetException;
|
||||
use Phpml\Exception\FileException;
|
||||
|
||||
/**
 * Dataset loaded from a text file in SVM-Light (or LibSVM) sparse format.
 *
 * Every line is expected to look like `<target> <index>:<value> ... # comment`:
 *  - the first column is a numeric target,
 *  - each following column is a 1-based feature index and a numeric value,
 *  - everything after the first `#` is ignored,
 *  - tabs are treated as single spaces.
 *
 * Features that are not listed on a line are filled with 0, and every sample
 * is padded to the width of the largest feature index found in the file.
 */
class SvmDataset extends ArrayDataset
{
    /**
     * @param string $filePath path to the file to load
     *
     * @throws FileException    when the file is missing or cannot be opened
     * @throws DatasetException when a line contains an invalid target, index or value
     */
    public function __construct(string $filePath)
    {
        [$samples, $targets] = self::readProblem($filePath);

        parent::__construct($samples, $targets);
    }

    /**
     * Reads the whole file and returns the dense samples with their targets.
     *
     * @return array two-element list: [samples, targets]
     *
     * @throws FileException
     * @throws DatasetException
     */
    private static function readProblem(string $filePath): array
    {
        $handle = self::openFile($filePath);

        $samples = [];
        $targets = [];
        $maxIndex = 0;

        try {
            while (($line = fgets($handle)) !== false) {
                [$sample, $target, $maxIndex] = self::processLine($line, $maxIndex);
                $samples[] = $sample;
                $targets[] = $target;
            }
        } finally {
            // Release the handle even when a malformed line aborts parsing.
            fclose($handle);
        }

        // Earlier lines may be narrower than later ones; widen them all to the final width.
        foreach ($samples as &$sample) {
            $sample = array_pad($sample, $maxIndex + 1, 0);
        }

        // Break the reference so later writes to $sample cannot alter the last row.
        unset($sample);

        return [$samples, $targets];
    }

    /**
     * Opens the file for binary reading.
     *
     * @return resource
     *
     * @throws FileException when the file does not exist or cannot be opened
     */
    private static function openFile(string $filePath)
    {
        if (!file_exists($filePath)) {
            throw FileException::missingFile(basename($filePath));
        }

        $handle = fopen($filePath, 'rb');
        if ($handle === false) {
            throw FileException::cantOpenFile(basename($filePath));
        }

        return $handle;
    }

    /**
     * Converts one raw line into a dense sample and its target.
     *
     * @param string $line     raw line as read from the file
     * @param int    $maxIndex largest zero-based feature index seen so far
     *
     * @return array three-element list: [sample, target, updated max index]
     *
     * @throws DatasetException
     */
    private static function processLine(string $line, int $maxIndex): array
    {
        $columns = self::parseLine($line);

        $target = self::parseTargetColumn($columns[0]);
        $sample = array_fill(0, $maxIndex + 1, 0);

        $n = count($columns);
        for ($i = 1; $i < $n; ++$i) {
            [$index, $value] = self::parseFeatureColumn($columns[$i]);
            if ($index > $maxIndex) {
                $maxIndex = $index;
                $sample = array_pad($sample, $maxIndex + 1, 0);
            }

            $sample[$index] = $value;
        }

        return [$sample, $target, $maxIndex];
    }

    /**
     * Strips the trailing comment and whitespace, then splits the line on single spaces.
     *
     * Consecutive or leading spaces yield empty columns, which the column
     * parsers subsequently reject.
     *
     * @return string[]
     */
    private static function parseLine(string $line): array
    {
        $line = explode('#', $line, 2)[0];
        $line = rtrim($line);
        $line = str_replace("\t", ' ', $line);

        return explode(' ', $line);
    }

    /**
     * @throws DatasetException when the target is not numeric
     */
    private static function parseTargetColumn(string $column): float
    {
        if (!is_numeric($column)) {
            throw DatasetException::invalidTarget($column);
        }

        return (float) $column;
    }

    /**
     * Parses a single `index:value` column.
     *
     * @return array two-element list: [zero-based index, value]
     *
     * @throws DatasetException when the column is not of the form `index:value`
     */
    private static function parseFeatureColumn(string $column): array
    {
        $feature = explode(':', $column, 2);
        if (count($feature) !== 2) {
            throw DatasetException::invalidValue($column);
        }

        $index = self::parseFeatureIndex($feature[0]);
        $value = self::parseFeatureValue($feature[1]);

        return [$index, $value];
    }

    /**
     * Validates a 1-based feature index and converts it to a zero-based array offset.
     *
     * @throws DatasetException when the index is not made of digits only or is less than 1
     */
    private static function parseFeatureIndex(string $index): int
    {
        // ctype_digit() already rejects empty, signed, fractional and non-numeric strings.
        if (!ctype_digit($index)) {
            throw DatasetException::invalidIndex($index);
        }

        if ((int) $index < 1) {
            throw DatasetException::invalidIndex($index);
        }

        return (int) $index - 1;
    }

    /**
     * @throws DatasetException when the value is not numeric
     */
    private static function parseFeatureValue(string $value): float
    {
        if (!is_numeric($value)) {
            throw DatasetException::invalidValue($value);
        }

        return (float) $value;
    }
}
|
@ -12,4 +12,19 @@ class DatasetException extends Exception
|
||||
{
|
||||
return new self(sprintf('Dataset root folder "%s" missing.', $path));
|
||||
}
|
||||
|
||||
/**
 * Builds the exception describing an invalid target.
 *
 * @param string $target offending text, embedded verbatim in the message
 */
public static function invalidTarget(string $target): self
{
    $message = sprintf('Invalid target "%s".', $target);

    return new self($message);
}
|
||||
|
||||
/**
 * Builds the exception describing an invalid index.
 *
 * @param string $index offending text, embedded verbatim in the message
 */
public static function invalidIndex(string $index): self
{
    $message = sprintf('Invalid index "%s".', $index);

    return new self($message);
}
|
||||
|
||||
/**
 * Builds the exception describing an invalid value.
 *
 * @param string $value offending text, embedded verbatim in the message
 */
public static function invalidValue(string $value): self
{
    $message = sprintf('Invalid value "%s".', $value);

    return new self($message);
}
|
||||
}
|
||||
|
1
tests/Dataset/Resources/svm/1x1.svm
Normal file
1
tests/Dataset/Resources/svm/1x1.svm
Normal file
@ -0,0 +1 @@
|
||||
0 1:2.3
|
3
tests/Dataset/Resources/svm/3x1.svm
Normal file
3
tests/Dataset/Resources/svm/3x1.svm
Normal file
@ -0,0 +1,3 @@
|
||||
1 1:2.3
|
||||
0 1:4.56
|
||||
1 1:78.9
|
3
tests/Dataset/Resources/svm/3x4.svm
Normal file
3
tests/Dataset/Resources/svm/3x4.svm
Normal file
@ -0,0 +1,3 @@
|
||||
1 1:2 2:4 3:6 4:8
|
||||
2 1:3 2:5 3:7 4:9
|
||||
0 1:1.2 2:3.4 3:5.6 4:7.8
|
2
tests/Dataset/Resources/svm/comments.svm
Normal file
2
tests/Dataset/Resources/svm/comments.svm
Normal file
@ -0,0 +1,2 @@
|
||||
0 1:2 # This is a comment.
|
||||
1 1:34 # This # is # : # also # a # comment # .
|
0
tests/Dataset/Resources/svm/empty.svm
Normal file
0
tests/Dataset/Resources/svm/empty.svm
Normal file
3
tests/Dataset/Resources/svm/err_empty_line.svm
Normal file
3
tests/Dataset/Resources/svm/err_empty_line.svm
Normal file
@ -0,0 +1,3 @@
|
||||
1 1:2.3
|
||||
|
||||
0 1:4.56
|
1
tests/Dataset/Resources/svm/err_index_zero.svm
Normal file
1
tests/Dataset/Resources/svm/err_index_zero.svm
Normal file
@ -0,0 +1 @@
|
||||
0 0:2.3
|
1
tests/Dataset/Resources/svm/err_invalid_feature.svm
Normal file
1
tests/Dataset/Resources/svm/err_invalid_feature.svm
Normal file
@ -0,0 +1 @@
|
||||
0 12345
|
1
tests/Dataset/Resources/svm/err_invalid_spaces.svm
Normal file
1
tests/Dataset/Resources/svm/err_invalid_spaces.svm
Normal file
@ -0,0 +1 @@
|
||||
0 1:2.3
|
1
tests/Dataset/Resources/svm/err_invalid_value.svm
Normal file
1
tests/Dataset/Resources/svm/err_invalid_value.svm
Normal file
@ -0,0 +1 @@
|
||||
0 1:xyz
|
1
tests/Dataset/Resources/svm/err_no_labels.svm
Normal file
1
tests/Dataset/Resources/svm/err_no_labels.svm
Normal file
@ -0,0 +1 @@
|
||||
1:2.3
|
1
tests/Dataset/Resources/svm/err_string_index.svm
Normal file
1
tests/Dataset/Resources/svm/err_string_index.svm
Normal file
@ -0,0 +1 @@
|
||||
0 x:2.3
|
1
tests/Dataset/Resources/svm/err_string_labels.svm
Normal file
1
tests/Dataset/Resources/svm/err_string_labels.svm
Normal file
@ -0,0 +1 @@
|
||||
A 1:2.3
|
2
tests/Dataset/Resources/svm/sparse.svm
Normal file
2
tests/Dataset/Resources/svm/sparse.svm
Normal file
@ -0,0 +1,2 @@
|
||||
0 2:3.45
|
||||
1 5:6.789
|
1
tests/Dataset/Resources/svm/tabs.svm
Normal file
1
tests/Dataset/Resources/svm/tabs.svm
Normal file
@ -0,0 +1 @@
|
||||
1 1:23 2:45 # comments
|
212
tests/Dataset/SvmDatasetTest.php
Normal file
212
tests/Dataset/SvmDatasetTest.php
Normal file
@ -0,0 +1,212 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Phpml\Tests\Dataset;
|
||||
|
||||
use Phpml\Dataset\SvmDataset;
|
||||
use Phpml\Exception\DatasetException;
|
||||
use Phpml\Exception\FileException;
|
||||
use PHPUnit\Framework\TestCase;
|
||||
|
||||
/**
 * Tests for SvmDataset: loading well-formed SVM-Light fixtures and rejecting malformed ones.
 *
 * Fixtures live in the Resources/svm directory next to this file.
 */
class SvmDatasetTest extends TestCase
{
    public function testSvmDatasetEmpty(): void
    {
        $this->assertDataset('empty.svm', [], []);
    }

    public function testSvmDataset1x1(): void
    {
        $this->assertDataset(
            '1x1.svm',
            [
                [2.3],
            ],
            [
                0,
            ]
        );
    }

    public function testSvmDataset3x1(): void
    {
        $this->assertDataset(
            '3x1.svm',
            [
                [2.3],
                [4.56],
                [78.9],
            ],
            [
                1,
                0,
                1,
            ]
        );
    }

    public function testSvmDataset3x4(): void
    {
        $this->assertDataset(
            '3x4.svm',
            [
                [2, 4, 6, 8],
                [3, 5, 7, 9],
                [1.2, 3.4, 5.6, 7.8],
            ],
            [
                1,
                2,
                0,
            ]
        );
    }

    public function testSvmDatasetSparse(): void
    {
        $this->assertDataset(
            'sparse.svm',
            [
                [0, 3.45, 0, 0, 0],
                [0, 0, 0, 0, 6.789],
            ],
            [
                0,
                1,
            ]
        );
    }

    public function testSvmDatasetComments(): void
    {
        $this->assertDataset(
            'comments.svm',
            [
                [2],
                [34],
            ],
            [
                0,
                1,
            ]
        );
    }

    public function testSvmDatasetTabs(): void
    {
        $this->assertDataset(
            'tabs.svm',
            [
                [23, 45],
            ],
            [
                1,
            ]
        );
    }

    public function testSvmDatasetMissingFile(): void
    {
        $this->assertLoadingFails('err_file_not_exists.svm', FileException::class);
    }

    public function testSvmDatasetEmptyLine(): void
    {
        $this->assertLoadingFails('err_empty_line.svm', DatasetException::class);
    }

    public function testSvmDatasetNoLabels(): void
    {
        $this->assertLoadingFails('err_no_labels.svm', DatasetException::class);
    }

    public function testSvmDatasetStringLabels(): void
    {
        $this->assertLoadingFails('err_string_labels.svm', DatasetException::class);
    }

    public function testSvmDatasetInvalidSpaces(): void
    {
        $this->assertLoadingFails('err_invalid_spaces.svm', DatasetException::class);
    }

    public function testSvmDatasetStringIndex(): void
    {
        $this->assertLoadingFails('err_string_index.svm', DatasetException::class);
    }

    public function testSvmDatasetIndexZero(): void
    {
        $this->assertLoadingFails('err_index_zero.svm', DatasetException::class);
    }

    public function testSvmDatasetInvalidValue(): void
    {
        $this->assertLoadingFails('err_invalid_value.svm', DatasetException::class);
    }

    public function testSvmDatasetInvalidFeature(): void
    {
        $this->assertLoadingFails('err_invalid_feature.svm', DatasetException::class);
    }

    /**
     * Loads the named fixture and compares its samples and targets with the expected ones.
     *
     * assertEquals (not assertSame) is deliberate: expectations are written
     * with integers while the loader returns floats.
     */
    private function assertDataset(string $baseName, array $expectedSamples, array $expectedTargets): void
    {
        $dataset = new SvmDataset(self::getFilePath($baseName));

        $this->assertEquals($expectedSamples, $dataset->getSamples());
        $this->assertEquals($expectedTargets, $dataset->getTargets());
    }

    /**
     * Expects that constructing a dataset from the named fixture throws the given exception.
     */
    private function assertLoadingFails(string $baseName, string $exceptionClass): void
    {
        $this->expectException($exceptionClass);

        new SvmDataset(self::getFilePath($baseName));
    }

    /**
     * Returns the absolute path of a fixture in the Resources/svm directory.
     */
    private static function getFilePath(string $baseName): string
    {
        return __DIR__.'/Resources/svm/'.$baseName;
    }
}
|
Loading…
Reference in New Issue
Block a user