mirror of
https://github.com/Llewellynvdm/php-ml.git
synced 2024-11-13 00:46:29 +00:00
Mnist Dataset (#326)
* Implement MnistDataset * Add MNIST dataset documentation
This commit is contained in:
parent
8ac013b2e4
commit
18c36b971f
@ -112,6 +112,7 @@ Public datasets are available in a separate repository [php-ai/php-ml-datasets](
|
||||
* [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset/)
|
||||
* [Files](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/files-dataset/)
|
||||
* [SVM](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/svm-dataset/)
|
||||
* [MNIST](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/mnist-dataset/)
|
||||
* Ready to use:
|
||||
* [Iris](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/iris/)
|
||||
* [Wine](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/wine/)
|
||||
|
@ -93,6 +93,7 @@ Example scripts are available in a separate repository [php-ai/php-ml-examples](
|
||||
* [CSV](machine-learning/datasets/csv-dataset.md)
|
||||
* [Files](machine-learning/datasets/files-dataset.md)
|
||||
* [SVM](machine-learning/datasets/svm-dataset.md)
|
||||
* [MNIST](machine-learning/datasets/mnist-dataset.md)
|
||||
* Ready to use:
|
||||
* [Iris](machine-learning/datasets/demo/iris.md)
|
||||
* [Wine](machine-learning/datasets/demo/wine.md)
|
||||
|
26
docs/machine-learning/datasets/mnist-dataset.md
Normal file
26
docs/machine-learning/datasets/mnist-dataset.md
Normal file
@ -0,0 +1,26 @@
|
||||
# MnistDataset
|
||||
|
||||
Helper class that loads data from the MNIST dataset: [http://yann.lecun.com/exdb/mnist/](http://yann.lecun.com/exdb/mnist/)
|
||||
|
||||
> The MNIST database of handwritten digits, available from this page, has a training set of 60,000 examples, and a test set of 10,000 examples. It is a subset of a larger set available from NIST. The digits have been size-normalized and centered in a fixed-size image.
|
||||
It is a good database for people who want to try learning techniques and pattern recognition methods on real-world data while spending minimal efforts on preprocessing and formatting.
|
||||
|
||||
### Constructor Parameters
|
||||
|
||||
* $imagePath - (string) path to image file
|
||||
* $labelPath - (string) path to label file
|
||||
|
||||
```
|
||||
use Phpml\Dataset\MnistDataset;
|
||||
|
||||
$trainDataset = new MnistDataset('train-images-idx3-ubyte', 'train-labels-idx1-ubyte');
|
||||
```
|
||||
|
||||
### Samples and labels
|
||||
|
||||
To get samples or labels you can use getters:
|
||||
|
||||
```
|
||||
$dataset->getSamples();
|
||||
$dataset->getTargets();
|
||||
```
|
@ -39,6 +39,7 @@ pages:
|
||||
- CSV Dataset: machine-learning/datasets/csv-dataset.md
|
||||
- Files Dataset: machine-learning/datasets/files-dataset.md
|
||||
- SVM Dataset: machine-learning/datasets/svm-dataset.md
|
||||
- MNIST Dataset: machine-learning/datasets/mnist-dataset.md
|
||||
- Ready to use datasets:
|
||||
- Iris: machine-learning/datasets/demo/iris.md
|
||||
- Wine: machine-learning/datasets/demo/wine.md
|
||||
|
@ -6,7 +6,7 @@ includes:
|
||||
parameters:
|
||||
ignoreErrors:
|
||||
- '#Property Phpml\\Clustering\\KMeans\\Cluster\:\:\$points \(iterable\<Phpml\\Clustering\\KMeans\\Point\>\&SplObjectStorage\) does not accept SplObjectStorage#'
|
||||
- '#Phpml\\Dataset\\FilesDataset::__construct\(\) does not call parent constructor from Phpml\\Dataset\\ArrayDataset#'
|
||||
- '#Phpml\\Dataset\\(.*)Dataset::__construct\(\) does not call parent constructor from Phpml\\Dataset\\ArrayDataset#'
|
||||
|
||||
# wide range cases
|
||||
- '#Parameter \#1 \$coordinates of class Phpml\\Clustering\\KMeans\\Point constructor expects array, array<int>\|Phpml\\Clustering\\KMeans\\Point given#'
|
||||
|
101
src/Dataset/MnistDataset.php
Normal file
101
src/Dataset/MnistDataset.php
Normal file
@ -0,0 +1,101 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Phpml\Dataset;
|
||||
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
|
||||
/**
|
||||
* MNIST dataset: http://yann.lecun.com/exdb/mnist/
|
||||
* original mnist dataset reader: https://github.com/AndrewCarterUK/mnist-neural-network-plain-php
|
||||
*/
|
||||
final class MnistDataset extends ArrayDataset
{
    /** Magic number expected in the first header field of an image file. */
    private const MAGIC_IMAGE = 0x00000803;

    /** Magic number expected in the first header field of a label file. */
    private const MAGIC_LABEL = 0x00000801;

    /** Only images with exactly this many rows are accepted. */
    private const IMAGE_ROWS = 28;

    /** Only images with exactly this many columns are accepted. */
    private const IMAGE_COLS = 28;

    /**
     * Loads samples from the image file and targets from the label file.
     *
     * @param string $imagePath path to the image file
     * @param string $labelPath path to the label file
     *
     * @throws InvalidArgumentException when a file cannot be opened, has an
     *                                  unexpected header, ends before all declared
     *                                  data was read, or when the number of images
     *                                  differs from the number of labels
     */
    public function __construct(string $imagePath, string $labelPath)
    {
        $this->samples = $this->readImages($imagePath);
        $this->targets = $this->readLabels($labelPath);

        if (count($this->samples) !== count($this->targets)) {
            throw new InvalidArgumentException('Must have the same number of images and labels');
        }
    }

    /**
     * Reads every image of the file as a flat list of pixel values divided by 255.
     *
     * The 16-byte header holds four big-endian unsigned 32-bit fields:
     * magic number, image count, rows and columns.
     *
     * @return array[] one entry per image, each holding rows * cols pixel values
     *
     * @throws InvalidArgumentException on an unreadable, malformed or truncated file
     */
    private function readImages(string $imagePath): array
    {
        $stream = fopen($imagePath, 'rb');

        if ($stream === false) {
            throw new InvalidArgumentException('Could not open file: '.$imagePath);
        }

        $images = [];

        try {
            $fields = unpack('Nmagic/Nsize/Nrows/Ncols', $this->readBytes($stream, 16, $imagePath));

            if ($fields === false || $fields['magic'] !== self::MAGIC_IMAGE) {
                throw new InvalidArgumentException('Invalid magic number: '.$imagePath);
            }

            if ($fields['rows'] !== self::IMAGE_ROWS) {
                throw new InvalidArgumentException('Invalid number of image rows: '.$imagePath);
            }

            if ($fields['cols'] !== self::IMAGE_COLS) {
                throw new InvalidArgumentException('Invalid number of image cols: '.$imagePath);
            }

            // Rows and cols were just validated, so the per-image size is fixed.
            $imageSize = self::IMAGE_ROWS * self::IMAGE_COLS;

            for ($i = 0; $i < $fields['size']; ++$i) {
                // readBytes() guarantees a complete image; a truncated file throws
                // instead of silently producing a sample with missing pixels.
                $pixels = unpack('C*', $this->readBytes($stream, $imageSize, $imagePath));

                // Scale each unsigned byte (0..255) into the range 0..1
                $images[] = array_map(static function ($b) {
                    return $b / 255;
                }, array_values($pixels));
            }
        } finally {
            fclose($stream);
        }

        return $images;
    }

    /**
     * Reads every label of the file as an integer.
     *
     * The 8-byte header holds two big-endian unsigned 32-bit fields:
     * magic number and label count; one unsigned byte per label follows.
     *
     * @return int[]
     *
     * @throws InvalidArgumentException on an unreadable, malformed or truncated file
     */
    private function readLabels(string $labelPath): array
    {
        $stream = fopen($labelPath, 'rb');

        if ($stream === false) {
            throw new InvalidArgumentException('Could not open file: '.$labelPath);
        }

        try {
            $fields = unpack('Nmagic/Nsize', $this->readBytes($stream, 8, $labelPath));

            if ($fields === false || $fields['magic'] !== self::MAGIC_LABEL) {
                throw new InvalidArgumentException('Invalid magic number: '.$labelPath);
            }

            $labels = $this->readBytes($stream, $fields['size'], $labelPath);
        } finally {
            fclose($stream);
        }

        // A file that declares zero labels has no payload to unpack.
        if ($labels === '') {
            return [];
        }

        return array_values(unpack('C*', $labels));
    }

    /**
     * Reads exactly $length bytes from the stream.
     *
     * fread() may return fewer bytes than requested, so the read is repeated
     * until the requested amount has been collected. A zero length returns an
     * empty string without calling fread().
     *
     * @param resource $stream open, readable stream
     * @param int      $length number of bytes to read
     * @param string   $path   file path, used only in the exception message
     *
     * @throws InvalidArgumentException when the stream ends before $length bytes were read
     */
    private function readBytes($stream, int $length, string $path): string
    {
        $buffer = '';

        while (strlen($buffer) < $length) {
            $chunk = fread($stream, $length - strlen($buffer));

            if ($chunk === false || $chunk === '') {
                throw new InvalidArgumentException('Unexpected end of file: '.$path);
            }

            $buffer .= $chunk;
        }

        return $buffer;
    }
}
|
33
tests/Dataset/MnistDatasetTest.php
Normal file
33
tests/Dataset/MnistDatasetTest.php
Normal file
@ -0,0 +1,33 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Phpml\Tests\Dataset;
|
||||
|
||||
use Phpml\Dataset\MnistDataset;
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
use PHPUnit\Framework\TestCase;
|
||||
|
||||
class MnistDatasetTest extends TestCase
{
    /**
     * Loading the image fixture together with its label fixture must expose
     * ten samples and ten targets.
     */
    public function testSimpleMnistDataset(): void
    {
        $imagePath = __DIR__.'/Resources/mnist/images-idx-ubyte';
        $labelPath = __DIR__.'/Resources/mnist/labels-idx-ubyte';

        $mnist = new MnistDataset($imagePath, $labelPath);

        self::assertCount(10, $mnist->getSamples());
        self::assertCount(10, $mnist->getTargets());
    }

    /**
     * Pairing the image fixture with the "labels-11" fixture (presumably
     * holding a different number of labels, judging by its name) must be
     * rejected with an InvalidArgumentException.
     */
    public function testCheckSamplesAndTargetsCountMatch(): void
    {
        $this->expectException(InvalidArgumentException::class);

        $imagePath = __DIR__.'/Resources/mnist/images-idx-ubyte';
        $mismatchedLabelPath = __DIR__.'/Resources/mnist/labels-11-idx-ubyte';

        new MnistDataset($imagePath, $mismatchedLabelPath);
    }
}
|
BIN
tests/Dataset/Resources/mnist/images-idx-ubyte
Normal file
BIN
tests/Dataset/Resources/mnist/images-idx-ubyte
Normal file
Binary file not shown.
BIN
tests/Dataset/Resources/mnist/labels-11-idx-ubyte
Normal file
BIN
tests/Dataset/Resources/mnist/labels-11-idx-ubyte
Normal file
Binary file not shown.
BIN
tests/Dataset/Resources/mnist/labels-idx-ubyte
Normal file
BIN
tests/Dataset/Resources/mnist/labels-idx-ubyte
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user