diff --git a/CHANGELOG.md b/CHANGELOG.md index 662a086..b403887 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,9 +4,16 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] +## [0.9.0] - Unreleased ### Added - [Preprocessing] Implement LabelEncoder +- [Preprocessing] Implement ColumnFilter +- [Preprocessing] Implement LambdaTransformer +- [Preprocessing] Implement NumberConverter +- [Preprocessing] Implement OneHotEncoder +- [Workflow] Implement FeatureUnion +- [Metric] Add Regression metrics: meanSquaredError, meanSquaredLogarithmicError, meanAbsoluteError, medianAbsoluteError, r2Score, maxError +- [Regression] Implement DecisionTreeRegressor ## [0.8.0] - 2019-03-20 ### Added diff --git a/README.md b/README.md index 3fef8ed..f34a49a 100644 --- a/README.md +++ b/README.md @@ -107,6 +107,7 @@ Public datasets are available in a separate repository [php-ai/php-ml-datasets]( * LambdaTransformer * NumberConverter * ColumnFilter + * OneHotEncoder * Feature Extraction * [Token Count Vectorizer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/token-count-vectorizer/) * NGramTokenizer diff --git a/src/Preprocessing/OneHotEncoder.php b/src/Preprocessing/OneHotEncoder.php new file mode 100644 index 0000000..c9d4d0a --- /dev/null +++ b/src/Preprocessing/OneHotEncoder.php @@ -0,0 +1,66 @@ +ignoreUnknown = $ignoreUnknown; + } + + /** + * Learns the categories of every column from the training samples. + * + * Categories learned by an earlier call are discarded first, so a refit cannot leave + * vectors of a stale width behind. Samples are re-indexed positionally before the + * columns are read, which is the same normalisation transform() applies, so samples + * with string keys are fitted and transformed consistently. An empty sample set + * leaves the encoder without categories instead of failing. + * + * @param array $samples rows of categorical feature values + * @param array|null $targets not used by this encoder + */ + public function fit(array $samples, ?array $targets = null): void + { + $this->categories = []; + if ($samples === []) { + return; + } + + $rows = array_map('array_values', $samples); + foreach (array_keys(reset($rows)) as $column) { + $this->fitColumn($column, array_values(array_unique(array_column($rows, $column)))); + } + } + + /** + * Replaces every sample, in place, with its one-hot encoded form. + * + * Each sample is re-indexed positionally before encoding. Rows are written back by + * key so no by-reference loop variable outlives the loop. + * + * @param array $samples rows to encode; modified in place + * @param array|null $targets not used by this encoder + */ + public function transform(array &$samples, ?array &$targets = null): void + { + foreach ($samples as $index => $sample) { + $samples[$index] = $this->transformSample(array_values($sample)); + } + } +
/** + * Stores a one-hot vector for every value of a single column. + * + * The entry for the column is rebuilt from scratch, so the column is registered even + * when it has no values and vectors from an earlier fit cannot linger. Values are + * re-indexed so each one maps to a position inside the vector. + * + * @param int $column zero-based column position + * @param array $values category values of that column + */ + private function fitColumn(int $column, array $values): void + { + $values = array_values($values); + $count = count($values); + $this->categories[$column] = []; + foreach ($values as $index => $value) { + $map = array_fill(0, $count, 0); + $map[$index] = 1; + $this->categories[$column][$value] = $map; + } + } + + /** + * Encodes one sample as the concatenation of the one-hot vectors of its features. + * + * A value without a learned category yields an all-zero vector of the column width + * when unknown values are ignored. A column that was never fitted has no width and + * contributes nothing. + * + * @param array $sample positionally indexed feature values + * + * @return array flat list of 0/1 flags + * + * @throws InvalidArgumentException when a value has no learned category and unknown values are not ignored + */ + private function transformSample(array $sample): array + { + $encoded = []; + foreach ($sample as $column => $feature) { + $known = $this->categories[$column] ?? []; + if (!isset($known[$feature]) && !$this->ignoreUnknown) { + throw new InvalidArgumentException(sprintf('Missing category "%s" for column %s in trained encoder', $feature, $column)); + } + + $vector = $known[$feature] ?? array_fill(0, count($known), 0); + /* Append in place instead of array_merge, which re-copied the whole result per feature. */ + foreach ($vector as $flag) { + $encoded[] = $flag; + } + } + + return $encoded; + } +} diff --git a/tests/Preprocessing/OneHotEncoderTest.php b/tests/Preprocessing/OneHotEncoderTest.php new file mode 100644 index 0000000..a5666b7 --- /dev/null +++ b/tests/Preprocessing/OneHotEncoderTest.php @@ -0,0 +1,66 @@ +fit($samples); + $encoder->transform($samples); + + self::assertEquals([ + [1, 0, 1, 0, 1, 0], + [0, 1, 1, 0, 1, 0], + [1, 0, 0, 1, 0, 1], + [0, 1, 0, 1, 1, 0], + ], $samples); + } + + public function testThrowExceptionWhenUnknownCategory(): void + { + $encoder = new OneHotEncoder(); + $encoder->fit([ + ['fish', 'New York', 'regression'], + ['dog', 'New York', 'regression'], + ['fish', 'Vancouver', 'classification'], + ['dog', 'Vancouver', 'regression'], + ]); + $samples = [['fish', 'New York', 'ka boom']]; + + $this->expectException(InvalidArgumentException::class); + + $encoder->transform($samples); + } + + public function testIgnoreMissingCategory(): void + { + $encoder = new OneHotEncoder(true); + $encoder->fit([ + ['fish', 'New York', 'regression'], + ['dog', 'New York', 'regression'], + ['fish', 'Vancouver', 'classification'], + ['dog', 'Vancouver', 'regression'], + ]); + $samples = [['ka', 'boom', 'riko']]; + $encoder->transform($samples); + + self::assertEquals([ + [0, 0, 0, 0, 0, 0], + ], $samples); + } +}