Implement SelectKBest algo for feature selection

This commit is contained in:
Arkadiusz Kondas 2018-02-11 22:10:12 +01:00
parent 52c9ba8291
commit fbf84ca95f
14 changed files with 389 additions and 18 deletions

View File

@ -20,7 +20,7 @@ class TfIdfTransformer implements Transformer
}
}
public function fit(array $samples): void
public function fit(array $samples, ?array $targets = null): void
{
$this->countTokensFrequency($samples);

View File

@ -41,7 +41,7 @@ class TokenCountVectorizer implements Transformer
$this->minDF = $minDF;
}
public function fit(array $samples): void
public function fit(array $samples, ?array $targets = null): void
{
$this->buildVocabulary($samples);
}

View File

@ -0,0 +1,10 @@
<?php
declare(strict_types=1);
namespace Phpml\FeatureSelection;
/**
 * Contract for functions that score the features of a labelled data set.
 *
 * SelectKBest consumes the returned array as one score per feature column,
 * keyed by column index, and ranks columns by descending score.
 */
interface ScoringFunction
{
/**
 * Scores the features found in $samples against $targets.
 *
 * @param array $samples rows of feature values
 * @param array $targets one target per row of $samples
 *
 * @return array scores produced for the features
 */
public function score(array $samples, array $targets): array;
}

View File

@ -0,0 +1,21 @@
<?php
declare(strict_types=1);
namespace Phpml\FeatureSelection\ScoringFunction;
use Phpml\FeatureSelection\ScoringFunction;
use Phpml\Math\Statistic\ANOVA;
/**
 * Scoring function that rates every feature with the one-way ANOVA F-value.
 */
final class ANOVAFValue implements ScoringFunction
{
    /**
     * Buckets the samples by their target and hands the buckets to
     * ANOVA::oneWayF().
     *
     * @param array $samples rows of feature values
     * @param array $targets target of each sample, looked up by the sample's key
     *
     * @return array the result of ANOVA::oneWayF() for the grouped samples
     */
    public function score(array $samples, array $targets): array
    {
        $samplesByClass = [];
        foreach (array_keys($samples) as $key) {
            $samplesByClass[$targets[$key]][] = $samples[$key];
        }

        // The class labels are irrelevant to ANOVA, so they are dropped here.
        $classes = array_values($samplesByClass);

        return ANOVA::oneWayF($classes);
    }
}

View File

@ -0,0 +1,78 @@
<?php
declare(strict_types=1);
namespace Phpml\FeatureSelection;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Exception\InvalidOperationException;
use Phpml\FeatureSelection\ScoringFunction\ANOVAFValue;
use Phpml\Transformer;
/**
 * Feature selector that keeps the k feature columns with the highest scores.
 *
 * Scores come from a ScoringFunction (ANOVAFValue when none is given). fit()
 * decides which columns to keep, transform() removes the other columns from
 * the samples in place.
 */
final class SelectKBest implements Transformer
{
    /**
     * @var ScoringFunction
     */
    private $scoringFunction;

    /**
     * @var int
     */
    private $k;

    /**
     * Scores returned by the scoring function during the last fit().
     *
     * @var array|null
     */
    private $scores = null;

    /**
     * Scores of the columns to keep, keyed by column index; null means
     * "keep every column".
     *
     * @var array|null
     */
    private $keepColumns = null;

    /**
     * @param ScoringFunction|null $scoringFunction scoring strategy, ANOVAFValue when null
     * @param int                  $k               number of best-scoring columns to keep
     */
    public function __construct(?ScoringFunction $scoringFunction = null, int $k = 10)
    {
        $this->scoringFunction = $scoringFunction ?? new ANOVAFValue();
        $this->k = $k;
    }

    /**
     * Scores every feature and remembers the k best columns.
     *
     * When k is greater than or equal to the number of scores, no column is
     * dropped by a later transform().
     *
     * @param array      $samples rows of feature values
     * @param array|null $targets targets of the samples; required by this transformer
     *
     * @throws InvalidArgumentException when $targets is null or empty
     */
    public function fit(array $samples, ?array $targets = null): void
    {
        if ($targets === null || $targets === []) {
            throw InvalidArgumentException::arrayCantBeEmpty();
        }

        $scores = $this->scoringFunction->score($samples, $targets);

        $this->scores = $scores;
        // Drop the selection of any previous fit(); otherwise a refit that hits
        // the early return below would keep filtering with stale columns.
        $this->keepColumns = null;

        if ($this->k >= count($scores)) {
            return;
        }

        // NaN does not compare consistently while sorting, so such scores are
        // ranked last. scores() still reports the untouched values.
        $ranking = $scores;
        foreach ($ranking as $column => $score) {
            if (is_float($score) && is_nan($score)) {
                $ranking[$column] = -INF;
            }
        }

        arsort($ranking);
        $this->keepColumns = array_slice($ranking, 0, $this->k, true);
    }

    /**
     * Removes the columns that were not selected by fit() from every sample
     * and reindexes the remaining values. Samples stay untouched when fit()
     * kept all columns or has not been called yet.
     *
     * @param array $samples rows of feature values, modified in place
     */
    public function transform(array &$samples): void
    {
        if ($this->keepColumns === null) {
            return;
        }

        foreach ($samples as &$sample) {
            $sample = array_values(array_intersect_key($sample, $this->keepColumns));
        }

        // Break the reference so it cannot leak into later writes.
        unset($sample);
    }

    /**
     * Returns the scores computed by the last fit().
     *
     * @throws InvalidOperationException when fit() has not been called yet
     */
    public function scores(): array
    {
        if ($this->scores === null) {
            throw new InvalidOperationException('SelectKBest require to fit first to get scores');
        }

        return $this->scores;
    }
}

View File

@ -33,11 +33,9 @@ final class VarianceThreshold implements Transformer
}
$this->threshold = $threshold;
$this->variances = [];
$this->keepColumns = [];
}
public function fit(array $samples): void
public function fit(array $samples, ?array $targets = null): void
{
$this->variances = array_map(function (array $column) {
return Variance::population($column);

View File

@ -0,0 +1,137 @@
<?php
declare(strict_types=1);
namespace Phpml\Math\Statistic;
use Phpml\Exception\InvalidArgumentException;
/**
* Analysis of variance
* https://en.wikipedia.org/wiki/Analysis_of_variance
*/
final class ANOVA
{
    /**
     * The one-way ANOVA tests the null hypothesis that 2 or more groups have
     * the same population mean. The test is applied to samples from two or
     * more groups, possibly with differing sizes.
     *
     * For every feature the result is (ssbn / dfbn) / (sswn / dfwn), where
     * ssbn / sswn are the between / within class sums of squares,
     * dfbn = classes - 1 and dfwn = samples - classes.
     *
     * A zero divisor does not raise an error: following IEEE 754 rules the
     * F-value is INF (or -INF) for a non-zero numerator and NAN for 0 / 0.
     * This happens e.g. for a feature that is constant inside every class.
     *
     * @param array|array[] $samples - each row is class samples
     *
     * @return array|float[] F-value per feature, keyed by feature index
     *
     * @throws InvalidArgumentException when fewer than 2 classes are given or a class has no samples
     */
    public static function oneWayF(array $samples): array
    {
        $classes = count($samples);
        if ($classes < 2) {
            throw InvalidArgumentException::arraySizeToSmall(2);
        }

        $samplesPerClass = array_map(function (array $class): int {
            return count($class);
        }, $samples);

        // An empty class has no first sample to size the accumulators from and
        // its size would later be used as a divisor.
        if (in_array(0, $samplesPerClass, true)) {
            throw InvalidArgumentException::arrayCantBeEmpty();
        }

        $allSamples = array_sum($samplesPerClass);
        $ssAllSamples = self::sumOfSquaresPerFeature($samples);
        $sumSamples = self::sumOfFeaturesPerClass($samples);
        $squareSumSamples = self::sumOfSquares($sumSamples);
        $sumSamplesSquare = self::squaresSum($sumSamples);
        $ssbn = self::calculateSsbn($samples, $sumSamplesSquare, $samplesPerClass, $squareSumSamples, $allSamples);
        $sswn = self::calculateSswn($ssbn, $ssAllSamples, $squareSumSamples, $allSamples);

        // $dfbn is at least 1 because of the class count check above;
        // $dfwn is 0 when every class holds exactly one sample.
        $dfbn = $classes - 1;
        $dfwn = $allSamples - $classes;

        $f = [];
        foreach ($ssbn as $index => $ssb) {
            $msb = $ssb / $dfbn;
            $msw = self::divide($sswn[$index], $dfwn);
            $f[$index] = self::divide($msb, $msw);
        }

        return $f;
    }

    /**
     * Division that follows IEEE 754 for a zero divisor instead of failing.
     *
     * @param float|int $numerator
     * @param float|int $denominator
     *
     * @return float|int the plain quotient, or INF / -INF / NAN for a zero divisor
     */
    private static function divide($numerator, $denominator)
    {
        if ((float) $denominator !== 0.0) {
            return $numerator / $denominator;
        }

        if ((float) $numerator === 0.0 || is_nan((float) $numerator)) {
            return NAN;
        }

        return $numerator > 0 ? INF : -INF;
    }

    /**
     * Sum of the squared values of every feature over all samples of all classes.
     */
    private static function sumOfSquaresPerFeature(array $samples): array
    {
        $sum = array_fill(0, count($samples[0][0]), 0);
        foreach ($samples as $class) {
            foreach ($class as $sample) {
                foreach ($sample as $index => $feature) {
                    $sum[$index] += $feature ** 2;
                }
            }
        }

        return $sum;
    }

    /**
     * Sum of every feature inside each class: one row of sums per class.
     */
    private static function sumOfFeaturesPerClass(array $samples): array
    {
        return array_map(function (array $class) {
            $sum = array_fill(0, count($class[0]), 0);
            foreach ($class as $sample) {
                foreach ($sample as $index => $feature) {
                    $sum[$index] += $feature;
                }
            }

            return $sum;
        }, $samples);
    }

    /**
     * Adds the per-class sums of every feature and squares each total
     * (square of the sum).
     */
    private static function sumOfSquares(array $sums): array
    {
        $squares = array_fill(0, count($sums[0]), 0);
        foreach ($sums as $row) {
            foreach ($row as $index => $sum) {
                $squares[$index] += $sum;
            }
        }

        return array_map(function ($sum) {
            return $sum ** 2;
        }, $squares);
    }

    /**
     * Squares every per-class sum individually, keeping the class/feature layout.
     */
    private static function squaresSum(array $sums): array
    {
        return array_map(function (array $row): array {
            return array_map(function ($sum) {
                return $sum ** 2;
            }, $row);
        }, $sums);
    }

    /**
     * Between-class sum of squares per feature: for every class the squared
     * class sum divided by the class size, added up, minus the squared
     * overall sum divided by the number of all samples.
     */
    private static function calculateSsbn(array $samples, array $sumSamplesSquare, array $samplesPerClass, array $squareSumSamples, int $allSamples): array
    {
        $ssbn = array_fill(0, count($samples[0][0]), 0);
        foreach ($sumSamplesSquare as $classIndex => $class) {
            foreach ($class as $index => $feature) {
                $ssbn[$index] += $feature / $samplesPerClass[$classIndex];
            }
        }

        foreach ($squareSumSamples as $index => $sum) {
            $ssbn[$index] -= $sum / $allSamples;
        }

        return $ssbn;
    }

    /**
     * Within-class sum of squares per feature: the total sum of squares
     * (sum of squared values minus squared overall sum / number of samples)
     * minus the between-class part.
     */
    private static function calculateSswn(array $ssbn, array $ssAllSamples, array $squareSumSamples, int $allSamples): array
    {
        $sswn = [];
        foreach ($ssAllSamples as $index => $ss) {
            $sswn[$index] = ($ss - $squareSumSamples[$index] / $allSamples) - $ssbn[$index];
        }

        return $sswn;
    }
}

View File

@ -43,7 +43,7 @@ class Imputer implements Preprocessor
$this->samples = $samples;
}
public function fit(array $samples): void
public function fit(array $samples, ?array $targets = null): void
{
$this->samples = $samples;
}

View File

@ -48,7 +48,7 @@ class Normalizer implements Preprocessor
$this->norm = $norm;
}
public function fit(array $samples): void
public function fit(array $samples, ?array $targets = null): void
{
if ($this->fitted) {
return;

View File

@ -7,12 +7,9 @@ namespace Phpml;
interface Transformer
{
/**
* @param array $samples
* Most transformers don't require targets to train, so the null default allows calling fit() without targets.
*/
public function fit(array $samples);
public function fit(array $samples, ?array $targets = null): void;
/**
* @param array $samples
*/
public function transform(array &$samples);
public function transform(array &$samples): void;
}

View File

@ -59,7 +59,7 @@ class MLPClassifierTest extends TestCase
public function testBackpropagationLearning(): void
{
// Single layer 2 classes.
$network = new MLPClassifier(2, [2], ['a', 'b']);
$network = new MLPClassifier(2, [2], ['a', 'b'], 1000);
$network->train(
[[1, 0], [0, 1], [1, 1], [0, 0]],
['a', 'b', 'a', 'b']
@ -118,7 +118,7 @@ class MLPClassifierTest extends TestCase
public function testBackpropagationLearningMultilayer(): void
{
// Multi-layer 2 classes.
$network = new MLPClassifier(5, [3, 2], ['a', 'b', 'c']);
$network = new MLPClassifier(5, [3, 2], ['a', 'b', 'c'], 2000);
$network->train(
[[1, 0, 0, 0, 0], [0, 1, 1, 0, 0], [1, 1, 1, 1, 1], [0, 0, 0, 0, 0]],
['a', 'b', 'a', 'c']
@ -133,7 +133,7 @@ class MLPClassifierTest extends TestCase
public function testBackpropagationLearningMulticlass(): void
{
// Multi-layer more than 2 classes.
$network = new MLPClassifier(5, [3, 2], ['a', 'b', 4]);
$network = new MLPClassifier(5, [3, 2], ['a', 'b', 4], 1000);
$network->train(
[[1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 1, 1, 0], [1, 1, 1, 1, 1], [0, 0, 0, 0, 0]],
['a', 'b', 'a', 'a', 4]
@ -151,7 +151,7 @@ class MLPClassifierTest extends TestCase
*/
public function testBackpropagationActivationFunctions(ActivationFunction $activationFunction): void
{
$network = new MLPClassifier(5, [3], ['a', 'b'], 10000, $activationFunction);
$network = new MLPClassifier(5, [3], ['a', 'b'], 1000, $activationFunction);
$network->train(
[[1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 1, 1, 0], [1, 1, 1, 1, 1]],
['a', 'b', 'a', 'a']
@ -178,7 +178,7 @@ class MLPClassifierTest extends TestCase
// Instantiate a new perceptron trained for the OR problem
$samples = [[0, 0], [1, 0], [0, 1], [1, 1]];
$targets = [0, 1, 1, 1];
$classifier = new MLPClassifier(2, [2], [0, 1]);
$classifier = new MLPClassifier(2, [2], [0, 1], 1000);
$classifier->train($samples, $targets);
$testSamples = [[0, 0], [1, 0], [0, 1], [1, 1]];
$predicted = $classifier->predict($testSamples);

View File

@ -0,0 +1,25 @@
<?php
declare(strict_types=1);
namespace Phpml\Tests\FeatureSelection\ScoringFunction;
use Phpml\Dataset\Demo\IrisDataset;
use Phpml\FeatureSelection\ScoringFunction\ANOVAFValue;
use PHPUnit\Framework\TestCase;
/**
 * Checks ANOVAFValue against known F-values for the Iris demo data set.
 */
final class ANOVAFValueTest extends TestCase
{
    public function testScoreForANOVAFValue(): void
    {
        $iris = new IrisDataset();
        $scoring = new ANOVAFValue();

        $expectedScores = [119.2645, 47.3644, 1179.0343, 959.3244];
        $actualScores = $scoring->score($iris->getSamples(), $iris->getTargets());

        // One score per Iris feature, compared with a tolerance of 0.0001.
        self::assertEquals($expectedScores, $actualScores, '', 0.0001);
    }
}

View File

@ -0,0 +1,61 @@
<?php
declare(strict_types=1);
namespace Phpml\Tests\FeatureSelection;
use Phpml\Dataset\Demo\IrisDataset;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Exception\InvalidOperationException;
use Phpml\FeatureSelection\ScoringFunction\ANOVAFValue;
use Phpml\FeatureSelection\SelectKBest;
use PHPUnit\Framework\TestCase;
/**
 * Behavioural tests for the SelectKBest feature selector.
 */
final class SelectKBestTest extends TestCase
{
    public function testSelectKBestWithDefaultScoringFunction(): void
    {
        $samples = [[1, 2, 1], [1, 3, 4], [5, 2, 1], [1, 3, 3], [1, 3, 4], [0, 3, 5]];
        $targets = ['a', 'a', 'a', 'b', 'b', 'b'];

        $selector = new SelectKBest(null, 2);
        $selector->fit($samples, $targets);
        $selector->transform($samples);

        // The first column is dropped, the remaining two are reindexed.
        self::assertEquals([[2, 1], [3, 4], [2, 1], [3, 3], [3, 4], [3, 5]], $samples);
    }

    public function testSelectKBestWithKBiggerThanFeatures(): void
    {
        $samples = [[1, 2, 1], [1, 3, 4], [5, 2, 1], [1, 3, 3], [1, 3, 4], [0, 3, 5]];
        $targets = ['a', 'a', 'a', 'b', 'b', 'b'];

        $selector = new SelectKBest(null, 4);
        $selector->fit($samples, $targets);
        $selector->transform($samples);

        // With k above the feature count the samples must stay untouched.
        self::assertEquals([[1, 2, 1], [1, 3, 4], [5, 2, 1], [1, 3, 3], [1, 3, 4], [0, 3, 5]], $samples);
    }

    public function testSelectKBestWithIrisDataset(): void
    {
        $dataset = new IrisDataset();
        $samples = $dataset->getSamples();

        $selector = new SelectKBest(new ANOVAFValue(), 2);
        $selector->fit($samples, $dataset->getTargets());
        $selector->transform($samples);

        self::assertCount(2, $samples[0]);
    }

    public function testThrowExceptionOnEmptyTargets(): void
    {
        $this->expectException(InvalidArgumentException::class);

        $selector = new SelectKBest(new ANOVAFValue(), 2);
        $selector->fit([[1, 2, 3], [4, 5, 6]], []);
    }

    public function testThrowExceptionWhenNotTrained(): void
    {
        $this->expectException(InvalidOperationException::class);

        $selector = new SelectKBest(new ANOVAFValue(), 2);
        $selector->scores();
    }
}

View File

@ -0,0 +1,44 @@
<?php
declare(strict_types=1);
namespace Phpml\Tests\Math\Statistic;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Math\Statistic\ANOVA;
use PHPUnit\Framework\TestCase;
/**
 * Tests for the one-way ANOVA F-value computation.
 */
final class ANOVATest extends TestCase
{
    public function testOneWayF(): void
    {
        $firstClass = [[1, 2, 1], [1, 3, 4], [5, 2, 1]];
        $secondClass = [[1, 3, 3], [1, 3, 4], [0, 3, 5]];
        $expected = [1.47058824, 4.0, 3.0];

        $actual = ANOVA::oneWayF([$firstClass, $secondClass]);

        self::assertEquals($expected, $actual, '', 0.00000001);
    }

    public function testOneWayFWithDifferingSizes(): void
    {
        // The second class holds one sample less than the first one.
        $firstClass = [[1, 2, 1], [1, 3, 4], [5, 2, 1]];
        $secondClass = [[1, 3, 3], [1, 3, 4]];
        $expected = [0.6, 2.4, 1.24615385];

        $actual = ANOVA::oneWayF([$firstClass, $secondClass]);

        self::assertEquals($expected, $actual, '', 0.00000001);
    }

    public function testThrowExceptionOnToSmallSamples(): void
    {
        $this->expectException(InvalidArgumentException::class);

        // A single class is not enough for a one-way ANOVA.
        $onlyClass = [[1, 2, 1], [1, 3, 4], [5, 2, 1]];

        ANOVA::oneWayF([$onlyClass]);
    }
}