Implement LabelEncoder (#369)

This commit is contained in:
Arkadiusz Kondas 2019-04-02 11:07:00 +02:00 committed by GitHub
parent d3888efa7a
commit dbbce0e066
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 121 additions and 0 deletions

View File

@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
### Added
- [Preprocessing] Implement LabelEncoder (#369)
## [0.8.0] - 2019-03-20
### Added
- [Tokenization] Added NGramTokenizer (#350)

View File

@ -100,6 +100,7 @@ Public datasets are available in a separate repository [php-ai/php-ml-datasets](
* Preprocessing
* [Normalization](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/normalization/)
* [Imputation missing values](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/imputation-missing-values/)
* LabelEncoder
* Feature Extraction
* [Token Count Vectorizer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/token-count-vectorizer/)
* NGramTokenizer

View File

@ -85,6 +85,7 @@ Example scripts are available in a separate repository [php-ai/php-ml-examples](
* Preprocessing
* [Normalization](machine-learning/preprocessing/normalization.md)
* [Imputation missing values](machine-learning/preprocessing/imputation-missing-values.md)
* LabelEncoder
* Feature Extraction
* [Token Count Vectorizer](machine-learning/feature-extraction/token-count-vectorizer.md)
* [Tf-idf Transformer](machine-learning/feature-extraction/tf-idf-transformer.md)

View File

@ -0,0 +1,47 @@
<?php
declare(strict_types=1);
namespace Phpml\Preprocessing;
/**
 * Encodes labels as consecutive integers, numbered from 0 in order of first appearance.
 *
 * Labels are identified by their string cast, so values that cast to the same
 * string (for example 1 and '1', or null and '') share a single class.
 */
final class LabelEncoder implements Preprocessor
{
    /**
     * Map of label (string cast) to its class index.
     *
     * PHP converts integer-like string keys such as '1' to int keys, which is
     * why the key type is int|string and not plain string.
     *
     * @var array<int|string, int>
     */
    private $classes = [];

    /**
     * Learns the label-to-index map from $samples, discarding any previous map.
     *
     * @param array      $samples Flat list of labels; each one is cast to string.
     * @param array|null $targets Not used by this method.
     */
    public function fit(array $samples, ?array $targets = null): void
    {
        $this->classes = [];

        foreach ($samples as $sample) {
            $label = (string) $sample;

            // The next free index equals the number of classes registered so far.
            if (!isset($this->classes[$label])) {
                $this->classes[$label] = count($this->classes);
            }
        }
    }

    /**
     * Replaces every label in $samples, in place, with its class index.
     *
     * A label that has no entry in the fitted map is not handled specially:
     * the lookup yields null and PHP reports an undefined index/key.
     *
     * @param array $samples Flat list of labels, modified by reference.
     */
    public function transform(array &$samples): void
    {
        foreach ($samples as &$sample) {
            $sample = $this->classes[(string) $sample];
        }

        // Break the reference so the last element is not left aliased to $sample.
        unset($sample);
    }

    /**
     * Replaces every class index in $samples, in place, with its label.
     *
     * Labels come back as the array keys of the fitted map, so an integer-like
     * label is restored as an int and a null label as ''.
     *
     * @param array $samples Flat list of class indexes, modified by reference.
     */
    public function inverseTransform(array &$samples): void
    {
        $labels = array_flip($this->classes);

        foreach ($samples as &$sample) {
            $sample = $labels[$sample];
        }

        // Break the reference so the last element is not left aliased to $sample.
        unset($sample);
    }

    /**
     * Returns the known labels ordered by class index.
     *
     * @return string[]
     */
    public function classes(): array
    {
        // array_keys() returns ints for integer-like labels; cast them back so
        // the result really is a list of strings, as declared.
        return array_map('strval', array_keys($this->classes));
    }
}

View File

@ -0,0 +1,68 @@
<?php
declare(strict_types=1);
namespace Phpml\Tests\Preprocessing;
use Phpml\Preprocessing\LabelEncoder;
use PHPUnit\Framework\TestCase;
/**
 * Tests for LabelEncoder: fitting, encoding, refitting and the encode/decode round trip.
 *
 * Assertions use assertSame so that a null or string result cannot pass where
 * an integer class index is expected (assertEquals compares loosely).
 */
final class LabelEncoderTest extends TestCase
{
    /**
     * Fitting on a list and transforming that same list yields the expected indexes.
     *
     * @dataProvider labelEncoderDataProvider
     */
    public function testFitAndTransform(array $samples, array $transformed): void
    {
        $le = new LabelEncoder();
        $le->fit($samples);
        $le->transform($samples);

        self::assertSame($transformed, $samples);
    }

    /**
     * Each data set is [labels to fit and transform, expected class indexes].
     *
     * @return array<string, array{0: array, 1: int[]}>
     */
    public static function labelEncoderDataProvider(): array
    {
        return [
            'repeated string labels' => [['one', 'one', 'two', 'three'], [0, 0, 1, 2]],
            'string and integer labels' => [['one', 1, 'two', 'three'], [0, 1, 2, 3]],
            'null label' => [['one', null, 'two', 'three'], [0, 1, 2, 3]],
            'single class' => [['one', 'one', 'one', 'one'], [0, 0, 0, 0]],
            'mixed repeated labels' => [
                ['one', 'one', 'one', 'one', null, null, 1, 1, 2, 'two'],
                [0, 0, 0, 0, 1, 1, 2, 2, 3, 4],
            ],
        ];
    }

    /**
     * A second fit() replaces the classes learned by the first one.
     */
    public function testResetClassesAfterNextFit(): void
    {
        $le = new LabelEncoder();

        $samples = ['Shanghai', 'Beijing', 'Karachi'];
        $le->fit($samples);
        self::assertSame(['Shanghai', 'Beijing', 'Karachi'], $le->classes());

        $samples = ['Istanbul', 'Dhaka', 'Tokyo'];
        $le->fit($samples);
        self::assertSame(['Istanbul', 'Dhaka', 'Tokyo'], $le->classes());
    }

    /**
     * transform() followed by inverseTransform() restores the original labels.
     */
    public function testFitAndTransformFullCycle(): void
    {
        $samples = ['Shanghai', 'Beijing', 'Karachi', 'Beijing', 'Beijing', 'Karachi'];
        $encoded = [0, 1, 2, 1, 1, 2];

        $le = new LabelEncoder();
        $le->fit($samples);
        self::assertSame(['Shanghai', 'Beijing', 'Karachi'], $le->classes());

        // Work on a copy so $samples stays available as the expected round-trip result.
        $transformed = $samples;
        $le->transform($transformed);
        self::assertSame($encoded, $transformed);

        $le->inverseTransform($transformed);
        self::assertSame($samples, $transformed);
    }
}