From fbf84ca95fa26650be4705ca85a36502094ad7bc Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sun, 11 Feb 2018 22:10:12 +0100 Subject: [PATCH] Implement SelectKBest algo for feature selection --- src/FeatureExtraction/TfIdfTransformer.php | 2 +- .../TokenCountVectorizer.php | 2 +- src/FeatureSelection/ScoringFunction.php | 10 ++ .../ScoringFunction/ANOVAFValue.php | 21 +++ src/FeatureSelection/SelectKBest.php | 78 ++++++++++ src/FeatureSelection/VarianceThreshold.php | 4 +- src/Math/Statistic/ANOVA.php | 137 ++++++++++++++++++ src/Preprocessing/Imputer.php | 2 +- src/Preprocessing/Normalizer.php | 2 +- src/Transformer.php | 9 +- tests/Classification/MLPClassifierTest.php | 10 +- .../ScoringFunction/ANOVAFValueTest.php | 25 ++++ tests/FeatureSelection/SelectKBestTest.php | 61 ++++++++ tests/Math/Statistic/ANOVATest.php | 44 ++++++ 14 files changed, 389 insertions(+), 18 deletions(-) create mode 100644 src/FeatureSelection/ScoringFunction.php create mode 100644 src/FeatureSelection/ScoringFunction/ANOVAFValue.php create mode 100644 src/FeatureSelection/SelectKBest.php create mode 100644 src/Math/Statistic/ANOVA.php create mode 100644 tests/FeatureSelection/ScoringFunction/ANOVAFValueTest.php create mode 100644 tests/FeatureSelection/SelectKBestTest.php create mode 100644 tests/Math/Statistic/ANOVATest.php diff --git a/src/FeatureExtraction/TfIdfTransformer.php b/src/FeatureExtraction/TfIdfTransformer.php index 4b678a4..30f0203 100644 --- a/src/FeatureExtraction/TfIdfTransformer.php +++ b/src/FeatureExtraction/TfIdfTransformer.php @@ -20,7 +20,7 @@ class TfIdfTransformer implements Transformer } } - public function fit(array $samples): void + public function fit(array $samples, ?array $targets = null): void { $this->countTokensFrequency($samples); diff --git a/src/FeatureExtraction/TokenCountVectorizer.php b/src/FeatureExtraction/TokenCountVectorizer.php index e0d4e10..8c757c0 100644 --- a/src/FeatureExtraction/TokenCountVectorizer.php +++ b/src/FeatureExtraction/TokenCountVectorizer.php @@ -41,7 +41,7 @@ class TokenCountVectorizer implements Transformer $this->minDF = $minDF; } - public function fit(array $samples): void + public function fit(array $samples, ?array $targets = null): void { $this->buildVocabulary($samples); } diff --git a/src/FeatureSelection/ScoringFunction.php b/src/FeatureSelection/ScoringFunction.php new file mode 100644 index 0000000..4c925f6 --- /dev/null +++ b/src/FeatureSelection/ScoringFunction.php @@ -0,0 +1,10 @@ + $sample) { + $grouped[$targets[$index]][] = $sample; + } + + return ANOVA::oneWayF(array_values($grouped)); + } +} diff --git a/src/FeatureSelection/SelectKBest.php b/src/FeatureSelection/SelectKBest.php new file mode 100644 index 0000000..8f4b273 --- /dev/null +++ b/src/FeatureSelection/SelectKBest.php @@ -0,0 +1,78 @@ +scoringFunction = $scoringFunction; + $this->k = $k; + } + + public function fit(array $samples, ?array $targets = null): void + { + if ($targets === null || empty($targets)) { + throw InvalidArgumentException::arrayCantBeEmpty(); + } + + $this->scores = $sorted = $this->scoringFunction->score($samples, $targets); + if ($this->k >= count($sorted)) { + return; + } + + arsort($sorted); + $this->keepColumns = array_slice($sorted, 0, $this->k, true); + } + + public function transform(array &$samples): void + { + if ($this->keepColumns === null) { + return; + } + + foreach ($samples as &$sample) { + $sample = array_values(array_intersect_key($sample, $this->keepColumns)); + } + } + + public function scores(): array + { + if ($this->scores === null) { + throw new InvalidOperationException('SelectKBest require to fit first to get scores'); + } + + return $this->scores; + } +} diff --git a/src/FeatureSelection/VarianceThreshold.php b/src/FeatureSelection/VarianceThreshold.php index 6a3d639..5ca2332 100644 --- a/src/FeatureSelection/VarianceThreshold.php +++ b/src/FeatureSelection/VarianceThreshold.php @@ -33,11 +33,9 @@ final class VarianceThreshold implements Transformer } $this->threshold = $threshold; - $this->variances = []; - $this->keepColumns = []; } - public function fit(array $samples): void + public function fit(array $samples, ?array $targets = null): void { $this->variances = array_map(function (array $column) { return Variance::population($column); diff --git a/src/Math/Statistic/ANOVA.php b/src/Math/Statistic/ANOVA.php new file mode 100644 index 0000000..b0d0d37 --- /dev/null +++ b/src/Math/Statistic/ANOVA.php @@ -0,0 +1,137 @@ + $msbValue) { + $f[$index] = $msbValue / $msw[$index]; + } + + return $f; + } + + private static function sumOfSquaresPerFeature(array $samples): array + { + $sum = array_fill(0, count($samples[0][0]), 0); + foreach ($samples as $class) { + foreach ($class as $sample) { + foreach ($sample as $index => $feature) { + $sum[$index] += $feature ** 2; + } + } + } + + return $sum; + } + + private static function sumOfFeaturesPerClass(array $samples): array + { + return array_map(function (array $class) { + $sum = array_fill(0, count($class[0]), 0); + foreach ($class as $sample) { + foreach ($sample as $index => $feature) { + $sum[$index] += $feature; + } + } + + return $sum; + }, $samples); + } + + private static function sumOfSquares(array $sums): array + { + $squares = array_fill(0, count($sums[0]), 0); + foreach ($sums as $row) { + foreach ($row as $index => $sum) { + $squares[$index] += $sum; + } + } + + return array_map(function ($sum) { + return $sum ** 2; + }, $squares); + } + + private static function squaresSum(array $sums): array + { + foreach ($sums as &$row) { + foreach ($row as &$sum) { + $sum = $sum ** 2; + } + } + + return $sums; + } + + private static function calculateSsbn(array $samples, array $sumSamplesSquare, array $samplesPerClass, array $squareSumSamples, int $allSamples): array + { + $ssbn = array_fill(0, count($samples[0][0]), 0); + foreach ($sumSamplesSquare as $classIndex => $class) { + foreach ($class as $index => $feature) { + $ssbn[$index] += $feature / $samplesPerClass[$classIndex]; + } + } + + foreach ($squareSumSamples as $index => $sum) { + $ssbn[$index] -= $sum / $allSamples; + } + + return $ssbn; + } + + private static function calculateSswn(array $ssbn, array $ssAllSamples, array $squareSumSamples, int $allSamples): array + { + $sswn = []; + foreach ($ssAllSamples as $index => $ss) { + $sswn[$index] = ($ss - $squareSumSamples[$index] / $allSamples) - $ssbn[$index]; + } + + return $sswn; + } +} diff --git a/src/Preprocessing/Imputer.php b/src/Preprocessing/Imputer.php index 593756c..fdce666 100644 --- a/src/Preprocessing/Imputer.php +++ b/src/Preprocessing/Imputer.php @@ -43,7 +43,7 @@ class Imputer implements Preprocessor $this->samples = $samples; } - public function fit(array $samples): void + public function fit(array $samples, ?array $targets = null): void { $this->samples = $samples; } diff --git a/src/Preprocessing/Normalizer.php b/src/Preprocessing/Normalizer.php index b2721de..39b4fbc 100644 --- a/src/Preprocessing/Normalizer.php +++ b/src/Preprocessing/Normalizer.php @@ -48,7 +48,7 @@ class Normalizer implements Preprocessor $this->norm = $norm; } - public function fit(array $samples): void + public function fit(array $samples, ?array $targets = null): void { if ($this->fitted) { return; diff --git a/src/Transformer.php b/src/Transformer.php index c36e5ca..7350e2c 100644 --- a/src/Transformer.php +++ b/src/Transformer.php @@ -7,12 +7,9 @@ namespace Phpml; interface Transformer { /** - * @param array $samples + * most transformers don't require targets to train so null allow to use fit method without setting targets */ - public function fit(array $samples); + public function fit(array $samples, ?array $targets = null): void; - /** - * @param array $samples - */ - public function transform(array &$samples); + public function transform(array &$samples): void; } diff --git a/tests/Classification/MLPClassifierTest.php b/tests/Classification/MLPClassifierTest.php index c46e297..c4c45c4 100644 --- a/tests/Classification/MLPClassifierTest.php +++ b/tests/Classification/MLPClassifierTest.php @@ -59,7 +59,7 @@ class MLPClassifierTest extends TestCase public function testBackpropagationLearning(): void { // Single layer 2 classes. - $network = new MLPClassifier(2, [2], ['a', 'b']); + $network = new MLPClassifier(2, [2], ['a', 'b'], 1000); $network->train( [[1, 0], [0, 1], [1, 1], [0, 0]], ['a', 'b', 'a', 'b'] @@ -118,7 +118,7 @@ class MLPClassifierTest extends TestCase public function testBackpropagationLearningMultilayer(): void { // Multi-layer 2 classes. - $network = new MLPClassifier(5, [3, 2], ['a', 'b', 'c']); + $network = new MLPClassifier(5, [3, 2], ['a', 'b', 'c'], 2000); $network->train( [[1, 0, 0, 0, 0], [0, 1, 1, 0, 0], [1, 1, 1, 1, 1], [0, 0, 0, 0, 0]], ['a', 'b', 'a', 'c'] @@ -133,7 +133,7 @@ class MLPClassifierTest extends TestCase public function testBackpropagationLearningMulticlass(): void { // Multi-layer more than 2 classes. - $network = new MLPClassifier(5, [3, 2], ['a', 'b', 4]); + $network = new MLPClassifier(5, [3, 2], ['a', 'b', 4], 1000); $network->train( [[1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 1, 1, 0], [1, 1, 1, 1, 1], [0, 0, 0, 0, 0]], ['a', 'b', 'a', 'a', 4] @@ -151,7 +151,7 @@ class MLPClassifierTest extends TestCase */ public function testBackpropagationActivationFunctions(ActivationFunction $activationFunction): void { - $network = new MLPClassifier(5, [3], ['a', 'b'], 10000, $activationFunction); + $network = new MLPClassifier(5, [3], ['a', 'b'], 1000, $activationFunction); $network->train( [[1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 1, 1, 0], [1, 1, 1, 1, 1]], ['a', 'b', 'a', 'a'] @@ -178,7 +178,7 @@ class MLPClassifierTest extends TestCase // Instantinate new Percetron trained for OR problem $samples = [[0, 0], [1, 0], [0, 1], [1, 1]]; $targets = [0, 1, 1, 1]; - $classifier = new MLPClassifier(2, [2], [0, 1]); + $classifier = new MLPClassifier(2, [2], [0, 1], 1000); $classifier->train($samples, $targets); $testSamples = [[0, 0], [1, 0], [0, 1], [1, 1]]; $predicted = $classifier->predict($testSamples); diff --git a/tests/FeatureSelection/ScoringFunction/ANOVAFValueTest.php b/tests/FeatureSelection/ScoringFunction/ANOVAFValueTest.php new file mode 100644 index 0000000..7a601db --- /dev/null +++ b/tests/FeatureSelection/ScoringFunction/ANOVAFValueTest.php @@ -0,0 +1,25 @@ +score($dataset->getSamples(), $dataset->getTargets()), + '', + 0.0001 + ); + } +} diff --git a/tests/FeatureSelection/SelectKBestTest.php b/tests/FeatureSelection/SelectKBestTest.php new file mode 100644 index 0000000..a035560 --- /dev/null +++ b/tests/FeatureSelection/SelectKBestTest.php @@ -0,0 +1,61 @@ +fit($samples, $targets); + $selector->transform($samples); + + self::assertEquals([[2, 1], [3, 4], [2, 1], [3, 3], [3, 4], [3, 5]], $samples); + } + + public function testSelectKBestWithKBiggerThanFeatures(): void + { + $samples = [[1, 2, 1], [1, 3, 4], [5, 2, 1], [1, 3, 3], [1, 3, 4], [0, 3, 5]]; + $targets = ['a', 'a', 'a', 'b', 'b', 'b']; + $selector = new SelectKBest(null, 4); + $selector->fit($samples, $targets); + $selector->transform($samples); + + self::assertEquals([[1, 2, 1], [1, 3, 4], [5, 2, 1], [1, 3, 3], [1, 3, 4], [0, 3, 5]], $samples); + } + + public function testSelectKBestWithIrisDataset(): void + { + $dataset = new IrisDataset(); + $selector = new SelectKBest(new ANOVAFValue(), 2); + $selector->fit($samples = $dataset->getSamples(), $dataset->getTargets()); + $selector->transform($samples); + + self::assertEquals(2, count($samples[0])); + } + + public function testThrowExceptionOnEmptyTargets(): void + { + $this->expectException(InvalidArgumentException::class); + $selector = new SelectKBest(new ANOVAFValue(), 2); + $selector->fit([[1, 2, 3], [4, 5, 6]], []); + } + + public function testThrowExceptionWhenNotTrained(): void + { + $this->expectException(InvalidOperationException::class); + $selector = new SelectKBest(new ANOVAFValue(), 2); + $selector->scores(); + } +} diff --git a/tests/Math/Statistic/ANOVATest.php b/tests/Math/Statistic/ANOVATest.php new file mode 100644 index 0000000..81717f8 --- /dev/null +++ b/tests/Math/Statistic/ANOVATest.php @@ -0,0 +1,44 @@ +expectException(InvalidArgumentException::class); + $samples = [ + [[1, 2, 1], [1, 3, 4], [5, 2, 1]], + ]; + + ANOVA::oneWayF($samples); + } +}