mirror of
https://github.com/Llewellynvdm/php-ml.git
synced 2024-11-21 20:45:10 +00:00
Implement SelectKBest algo for feature selection
This commit is contained in:
parent
52c9ba8291
commit
fbf84ca95f
@ -20,7 +20,7 @@ class TfIdfTransformer implements Transformer
|
||||
}
|
||||
}
|
||||
|
||||
public function fit(array $samples): void
|
||||
public function fit(array $samples, ?array $targets = null): void
|
||||
{
|
||||
$this->countTokensFrequency($samples);
|
||||
|
||||
|
@ -41,7 +41,7 @@ class TokenCountVectorizer implements Transformer
|
||||
$this->minDF = $minDF;
|
||||
}
|
||||
|
||||
public function fit(array $samples): void
|
||||
public function fit(array $samples, ?array $targets = null): void
|
||||
{
|
||||
$this->buildVocabulary($samples);
|
||||
}
|
||||
|
10
src/FeatureSelection/ScoringFunction.php
Normal file
10
src/FeatureSelection/ScoringFunction.php
Normal file
@ -0,0 +1,10 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Phpml\FeatureSelection;
|
||||
|
||||
/**
 * Contract for objects that rate a dataset's features against its targets.
 */
interface ScoringFunction
{
    /**
     * Computes scores for the given samples with respect to the given targets.
     *
     * @param array $samples rows of feature values
     * @param array $targets target values, looked up by the same index as $samples
     *
     * @return array the computed scores (callers in this package treat the
     *               array keys as feature column indexes)
     */
    public function score(array $samples, array $targets): array;
}
|
21
src/FeatureSelection/ScoringFunction/ANOVAFValue.php
Normal file
21
src/FeatureSelection/ScoringFunction/ANOVAFValue.php
Normal file
@ -0,0 +1,21 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Phpml\FeatureSelection\ScoringFunction;
|
||||
|
||||
use Phpml\FeatureSelection\ScoringFunction;
|
||||
use Phpml\Math\Statistic\ANOVA;
|
||||
|
||||
/**
 * Scoring function that rates features with the one-way ANOVA F-value.
 */
final class ANOVAFValue implements ScoringFunction
{
    /**
     * Buckets the samples by their target value and delegates to
     * ANOVA::oneWayF() to obtain the F-values.
     *
     * @param array $samples rows of feature values
     * @param array $targets target of each sample, looked up by the sample's index
     *
     * @return array result of ANOVA::oneWayF() for the grouped samples
     */
    public function score(array $samples, array $targets): array
    {
        $samplesByClass = [];
        foreach (array_keys($samples) as $position) {
            $label = $targets[$position];
            $samplesByClass[$label][] = $samples[$position];
        }

        // ANOVA expects a plain list of classes, so the labels are dropped here.
        $classes = array_values($samplesByClass);

        return ANOVA::oneWayF($classes);
    }
}
|
78
src/FeatureSelection/SelectKBest.php
Normal file
78
src/FeatureSelection/SelectKBest.php
Normal file
@ -0,0 +1,78 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Phpml\FeatureSelection;
|
||||
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
use Phpml\Exception\InvalidOperationException;
|
||||
use Phpml\FeatureSelection\ScoringFunction\ANOVAFValue;
|
||||
use Phpml\Transformer;
|
||||
|
||||
/**
 * Feature selector that keeps the k highest-scoring feature columns.
 *
 * Scores come from the injected ScoringFunction (ANOVAFValue when none is
 * given). When k is greater than or equal to the number of scored features,
 * transform() leaves the samples untouched.
 */
final class SelectKBest implements Transformer
{
    /**
     * @var ScoringFunction
     */
    private $scoringFunction;

    /**
     * Number of top-scoring features to keep.
     *
     * @var int
     */
    private $k;

    /**
     * Scores from the most recent fit(), or null before the first fit.
     *
     * @var array|null
     */
    private $scores = null;

    /**
     * Column index => score for the columns to keep, or null when no
     * filtering should happen.
     *
     * @var array|null
     */
    private $keepColumns = null;

    /**
     * @param ScoringFunction|null $scoringFunction scoring strategy; defaults to ANOVAFValue
     * @param int                  $k               number of features to keep
     */
    public function __construct(?ScoringFunction $scoringFunction = null, int $k = 10)
    {
        $this->scoringFunction = $scoringFunction ?? new ANOVAFValue();
        $this->k = $k;
    }

    /**
     * Scores every feature and remembers which k columns rank highest.
     *
     * @param array      $samples rows of feature values
     * @param array|null $targets target values; required by this transformer
     *
     * @throws InvalidArgumentException when $targets is null or empty
     */
    public function fit(array $samples, ?array $targets = null): void
    {
        if ($targets === null || $targets === []) {
            throw InvalidArgumentException::arrayCantBeEmpty();
        }

        $ranked = $this->scoringFunction->score($samples, $targets);
        $this->scores = $ranked;

        // Drop any selection left over from a previous fit(); otherwise a
        // refit that takes the early return below would keep filtering with
        // a stale column mask.
        $this->keepColumns = null;

        if ($this->k >= count($ranked)) {
            return;
        }

        // Sort descending by score while preserving the column indexes as keys.
        arsort($ranked);
        $this->keepColumns = array_slice($ranked, 0, $this->k, true);
    }

    /**
     * Reduces each sample in place to the selected columns and reindexes it.
     * Does nothing when no selection is active.
     *
     * @param array $samples rows of feature values, modified by reference
     */
    public function transform(array &$samples): void
    {
        if ($this->keepColumns === null) {
            return;
        }

        foreach ($samples as &$sample) {
            $sample = array_values(array_intersect_key($sample, $this->keepColumns));
        }

        // Break the reference so later writes to $sample cannot alter the last row.
        unset($sample);
    }

    /**
     * Returns the scores computed by the most recent fit().
     *
     * @throws InvalidOperationException when fit() has not been called yet
     */
    public function scores(): array
    {
        if ($this->scores === null) {
            throw new InvalidOperationException('SelectKBest require to fit first to get scores');
        }

        return $this->scores;
    }
}
|
@ -33,11 +33,9 @@ final class VarianceThreshold implements Transformer
|
||||
}
|
||||
|
||||
$this->threshold = $threshold;
|
||||
$this->variances = [];
|
||||
$this->keepColumns = [];
|
||||
}
|
||||
|
||||
public function fit(array $samples): void
|
||||
public function fit(array $samples, ?array $targets = null): void
|
||||
{
|
||||
$this->variances = array_map(function (array $column) {
|
||||
return Variance::population($column);
|
||||
|
137
src/Math/Statistic/ANOVA.php
Normal file
137
src/Math/Statistic/ANOVA.php
Normal file
@ -0,0 +1,137 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Phpml\Math\Statistic;
|
||||
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
|
||||
/**
 * Analysis of variance
 * https://en.wikipedia.org/wiki/Analysis_of_variance
 */
final class ANOVA
{
    /**
     * The one-way ANOVA tests the null hypothesis that 2 or more groups have
     * the same population mean. The test is applied to samples from two or
     * more groups, possibly with differing sizes.
     *
     * A feature whose within-group mean square is exactly zero yields INF
     * (or NAN when the between-group mean square is zero as well) instead of
     * raising a division-by-zero error.
     *
     * @param array|array[] $samples - each row is class samples
     *
     * @return array|float[] one F-value per feature
     *
     * @throws InvalidArgumentException when fewer than two classes are given
     */
    public static function oneWayF(array $samples): array
    {
        $classes = count($samples);
        if ($classes < 2) {
            throw InvalidArgumentException::arraySizeToSmall(2);
        }

        $samplesPerClass = array_map(function (array $class): int {
            return count($class);
        }, $samples);
        $allSamples = array_sum($samplesPerClass);
        $ssAllSamples = self::sumOfSquaresPerFeature($samples);
        $sumSamples = self::sumOfFeaturesPerClass($samples);
        $squareSumSamples = self::sumOfSquares($sumSamples);
        $sumSamplesSquare = self::squaresSum($sumSamples);
        $ssbn = self::calculateSsbn($samples, $sumSamplesSquare, $samplesPerClass, $squareSumSamples, $allSamples);
        $sswn = self::calculateSswn($ssbn, $ssAllSamples, $squareSumSamples, $allSamples);

        // Degrees of freedom between and within groups.
        $dfbn = $classes - 1;
        $dfwn = $allSamples - $classes;

        $f = [];
        foreach ($ssbn as $index => $betweenSum) {
            $msb = self::divide($betweenSum, $dfbn);
            // $dfwn is 0 when every class holds a single sample, and the
            // within-group sum is 0 for features constant inside each class;
            // both cases must not abort the whole computation.
            $msw = self::divide($sswn[$index], $dfwn);
            $f[$index] = self::divide($msb, $msw);
        }

        return $f;
    }

    /**
     * Divides two numbers, returning INF, -INF or NAN for a zero denominator
     * instead of triggering a division-by-zero error or warning.
     */
    private static function divide(float $numerator, float $denominator): float
    {
        if ($denominator !== 0.0) {
            return $numerator / $denominator;
        }

        if ($numerator === 0.0 || is_nan($numerator)) {
            return NAN;
        }

        return $numerator > 0 ? INF : -INF;
    }

    /**
     * Sums the squared values of each feature over all samples of all classes.
     */
    private static function sumOfSquaresPerFeature(array $samples): array
    {
        $sum = array_fill(0, count($samples[0][0]), 0);
        foreach ($samples as $class) {
            foreach ($class as $sample) {
                foreach ($sample as $index => $feature) {
                    $sum[$index] += $feature ** 2;
                }
            }
        }

        return $sum;
    }

    /**
     * Sums the values of each feature separately for every class.
     */
    private static function sumOfFeaturesPerClass(array $samples): array
    {
        return array_map(function (array $class) {
            $sum = array_fill(0, count($class[0]), 0);
            foreach ($class as $sample) {
                foreach ($sample as $index => $feature) {
                    $sum[$index] += $feature;
                }
            }

            return $sum;
        }, $samples);
    }

    /**
     * Adds the per-class sums together for each feature and squares the total.
     */
    private static function sumOfSquares(array $sums): array
    {
        $squares = array_fill(0, count($sums[0]), 0);
        foreach ($sums as $row) {
            foreach ($row as $index => $sum) {
                $squares[$index] += $sum;
            }
        }

        return array_map(function ($sum) {
            return $sum ** 2;
        }, $squares);
    }

    /**
     * Squares every per-class, per-feature sum.
     */
    private static function squaresSum(array $sums): array
    {
        return array_map(function (array $row): array {
            return array_map(function ($sum) {
                return $sum ** 2;
            }, $row);
        }, $sums);
    }

    /**
     * Between-group sum of squares per feature: the squared class sums, each
     * divided by its class size, minus the squared grand sum divided by the
     * total number of samples.
     */
    private static function calculateSsbn(array $samples, array $sumSamplesSquare, array $samplesPerClass, array $squareSumSamples, int $allSamples): array
    {
        $ssbn = array_fill(0, count($samples[0][0]), 0);
        foreach ($sumSamplesSquare as $classIndex => $class) {
            foreach ($class as $index => $feature) {
                $ssbn[$index] += $feature / $samplesPerClass[$classIndex];
            }
        }

        foreach ($squareSumSamples as $index => $sum) {
            $ssbn[$index] -= $sum / $allSamples;
        }

        return $ssbn;
    }

    /**
     * Within-group sum of squares per feature: the total sum of squares
     * minus the between-group part.
     */
    private static function calculateSswn(array $ssbn, array $ssAllSamples, array $squareSumSamples, int $allSamples): array
    {
        $sswn = [];
        foreach ($ssAllSamples as $index => $ss) {
            $sswn[$index] = ($ss - $squareSumSamples[$index] / $allSamples) - $ssbn[$index];
        }

        return $sswn;
    }
}
|
@ -43,7 +43,7 @@ class Imputer implements Preprocessor
|
||||
$this->samples = $samples;
|
||||
}
|
||||
|
||||
public function fit(array $samples): void
|
||||
public function fit(array $samples, ?array $targets = null): void
|
||||
{
|
||||
$this->samples = $samples;
|
||||
}
|
||||
|
@ -48,7 +48,7 @@ class Normalizer implements Preprocessor
|
||||
$this->norm = $norm;
|
||||
}
|
||||
|
||||
public function fit(array $samples): void
|
||||
public function fit(array $samples, ?array $targets = null): void
|
||||
{
|
||||
if ($this->fitted) {
|
||||
return;
|
||||
|
@ -7,12 +7,9 @@ namespace Phpml;
|
||||
interface Transformer
|
||||
{
|
||||
/**
|
||||
* @param array $samples
|
||||
* Most transformers don't need targets to train, so the null default lets fit() be called without them.
|
||||
*/
|
||||
public function fit(array $samples);
|
||||
public function fit(array $samples, ?array $targets = null): void;
|
||||
|
||||
/**
|
||||
* @param array $samples
|
||||
*/
|
||||
public function transform(array &$samples);
|
||||
public function transform(array &$samples): void;
|
||||
}
|
||||
|
@ -59,7 +59,7 @@ class MLPClassifierTest extends TestCase
|
||||
public function testBackpropagationLearning(): void
|
||||
{
|
||||
// Single layer 2 classes.
|
||||
$network = new MLPClassifier(2, [2], ['a', 'b']);
|
||||
$network = new MLPClassifier(2, [2], ['a', 'b'], 1000);
|
||||
$network->train(
|
||||
[[1, 0], [0, 1], [1, 1], [0, 0]],
|
||||
['a', 'b', 'a', 'b']
|
||||
@ -118,7 +118,7 @@ class MLPClassifierTest extends TestCase
|
||||
public function testBackpropagationLearningMultilayer(): void
|
||||
{
|
||||
// Multi-layer 2 classes.
|
||||
$network = new MLPClassifier(5, [3, 2], ['a', 'b', 'c']);
|
||||
$network = new MLPClassifier(5, [3, 2], ['a', 'b', 'c'], 2000);
|
||||
$network->train(
|
||||
[[1, 0, 0, 0, 0], [0, 1, 1, 0, 0], [1, 1, 1, 1, 1], [0, 0, 0, 0, 0]],
|
||||
['a', 'b', 'a', 'c']
|
||||
@ -133,7 +133,7 @@ class MLPClassifierTest extends TestCase
|
||||
public function testBackpropagationLearningMulticlass(): void
|
||||
{
|
||||
// Multi-layer more than 2 classes.
|
||||
$network = new MLPClassifier(5, [3, 2], ['a', 'b', 4]);
|
||||
$network = new MLPClassifier(5, [3, 2], ['a', 'b', 4], 1000);
|
||||
$network->train(
|
||||
[[1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 1, 1, 0], [1, 1, 1, 1, 1], [0, 0, 0, 0, 0]],
|
||||
['a', 'b', 'a', 'a', 4]
|
||||
@ -151,7 +151,7 @@ class MLPClassifierTest extends TestCase
|
||||
*/
|
||||
public function testBackpropagationActivationFunctions(ActivationFunction $activationFunction): void
|
||||
{
|
||||
$network = new MLPClassifier(5, [3], ['a', 'b'], 10000, $activationFunction);
|
||||
$network = new MLPClassifier(5, [3], ['a', 'b'], 1000, $activationFunction);
|
||||
$network->train(
|
||||
[[1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 1, 1, 0], [1, 1, 1, 1, 1]],
|
||||
['a', 'b', 'a', 'a']
|
||||
@ -178,7 +178,7 @@ class MLPClassifierTest extends TestCase
|
||||
// Instantiate a new Perceptron trained for the OR problem
|
||||
$samples = [[0, 0], [1, 0], [0, 1], [1, 1]];
|
||||
$targets = [0, 1, 1, 1];
|
||||
$classifier = new MLPClassifier(2, [2], [0, 1]);
|
||||
$classifier = new MLPClassifier(2, [2], [0, 1], 1000);
|
||||
$classifier->train($samples, $targets);
|
||||
$testSamples = [[0, 0], [1, 0], [0, 1], [1, 1]];
|
||||
$predicted = $classifier->predict($testSamples);
|
||||
|
25
tests/FeatureSelection/ScoringFunction/ANOVAFValueTest.php
Normal file
25
tests/FeatureSelection/ScoringFunction/ANOVAFValueTest.php
Normal file
@ -0,0 +1,25 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Phpml\Tests\FeatureSelection\ScoringFunction;
|
||||
|
||||
use Phpml\Dataset\Demo\IrisDataset;
|
||||
use Phpml\FeatureSelection\ScoringFunction\ANOVAFValue;
|
||||
use PHPUnit\Framework\TestCase;
|
||||
|
||||
/**
 * Checks the ANOVA F-value scoring function against fixed reference scores.
 */
final class ANOVAFValueTest extends TestCase
{
    /**
     * Scores the Iris demo dataset and compares the four feature scores with
     * the expected values, using a tolerance of 0.0001.
     */
    public function testScoreForANOVAFValue(): void
    {
        $iris = new IrisDataset();
        $scorer = new ANOVAFValue();
        $expectedScores = [119.2645, 47.3644, 1179.0343, 959.3244];

        $actualScores = $scorer->score($iris->getSamples(), $iris->getTargets());

        self::assertEquals($expectedScores, $actualScores, '', 0.0001);
    }
}
|
61
tests/FeatureSelection/SelectKBestTest.php
Normal file
61
tests/FeatureSelection/SelectKBestTest.php
Normal file
@ -0,0 +1,61 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Phpml\Tests\FeatureSelection;
|
||||
|
||||
use Phpml\Dataset\Demo\IrisDataset;
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
use Phpml\Exception\InvalidOperationException;
|
||||
use Phpml\FeatureSelection\ScoringFunction\ANOVAFValue;
|
||||
use Phpml\FeatureSelection\SelectKBest;
|
||||
use PHPUnit\Framework\TestCase;
|
||||
|
||||
/**
 * Behavioural tests for the SelectKBest feature selector.
 */
final class SelectKBestTest extends TestCase
{
    /**
     * With the default scoring function and k = 2, the first column of the
     * fixture is expected to be dropped.
     */
    public function testSelectKBestWithDefaultScoringFunction(): void
    {
        $rows = [[1, 2, 1], [1, 3, 4], [5, 2, 1], [1, 3, 3], [1, 3, 4], [0, 3, 5]];
        $labels = ['a', 'a', 'a', 'b', 'b', 'b'];

        $kBest = new SelectKBest(null, 2);
        $kBest->fit($rows, $labels);
        $kBest->transform($rows);

        $expected = [[2, 1], [3, 4], [2, 1], [3, 3], [3, 4], [3, 5]];
        self::assertEquals($expected, $rows);
    }

    /**
     * Asking for more features than exist must leave the samples unchanged.
     */
    public function testSelectKBestWithKBiggerThanFeatures(): void
    {
        $rows = [[1, 2, 1], [1, 3, 4], [5, 2, 1], [1, 3, 3], [1, 3, 4], [0, 3, 5]];
        $labels = ['a', 'a', 'a', 'b', 'b', 'b'];

        $kBest = new SelectKBest(null, 4);
        $kBest->fit($rows, $labels);
        $kBest->transform($rows);

        $expected = [[1, 2, 1], [1, 3, 4], [5, 2, 1], [1, 3, 3], [1, 3, 4], [0, 3, 5]];
        self::assertEquals($expected, $rows);
    }

    /**
     * On the Iris demo dataset with k = 2, each sample keeps two columns.
     */
    public function testSelectKBestWithIrisDataset(): void
    {
        $iris = new IrisDataset();
        $kBest = new SelectKBest(new ANOVAFValue(), 2);

        $rows = $iris->getSamples();
        $kBest->fit($rows, $iris->getTargets());
        $kBest->transform($rows);

        $columnCount = count($rows[0]);
        self::assertEquals(2, $columnCount);
    }

    /**
     * Fitting with an empty targets array is rejected.
     */
    public function testThrowExceptionOnEmptyTargets(): void
    {
        $this->expectException(InvalidArgumentException::class);

        $kBest = new SelectKBest(new ANOVAFValue(), 2);
        $kBest->fit([[1, 2, 3], [4, 5, 6]], []);
    }

    /**
     * Reading the scores before fitting is rejected.
     */
    public function testThrowExceptionWhenNotTrained(): void
    {
        $this->expectException(InvalidOperationException::class);

        $kBest = new SelectKBest(new ANOVAFValue(), 2);
        $kBest->scores();
    }
}
|
44
tests/Math/Statistic/ANOVATest.php
Normal file
44
tests/Math/Statistic/ANOVATest.php
Normal file
@ -0,0 +1,44 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Phpml\Tests\Math\Statistic;
|
||||
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
use Phpml\Math\Statistic\ANOVA;
|
||||
use PHPUnit\Framework\TestCase;
|
||||
|
||||
/**
 * Tests for the one-way ANOVA F statistic.
 */
final class ANOVATest extends TestCase
{
    /**
     * Two classes of equal size, three features each.
     */
    public function testOneWayF(): void
    {
        $classA = [[1, 2, 1], [1, 3, 4], [5, 2, 1]];
        $classB = [[1, 3, 3], [1, 3, 4], [0, 3, 5]];
        $expected = [1.47058824, 4.0, 3.0];

        $actual = ANOVA::oneWayF([$classA, $classB]);

        self::assertEquals($expected, $actual, '', 0.00000001);
    }

    /**
     * Classes are allowed to contain a different number of samples.
     */
    public function testOneWayFWithDifferingSizes(): void
    {
        $classA = [[1, 2, 1], [1, 3, 4], [5, 2, 1]];
        $classB = [[1, 3, 3], [1, 3, 4]];
        $expected = [0.6, 2.4, 1.24615385];

        $actual = ANOVA::oneWayF([$classA, $classB]);

        self::assertEquals($expected, $actual, '', 0.00000001);
    }

    /**
     * A single class is not enough input and must be rejected.
     */
    public function testThrowExceptionOnToSmallSamples(): void
    {
        $this->expectException(InvalidArgumentException::class);

        $onlyClass = [[1, 2, 1], [1, 3, 4], [5, 2, 1]];

        ANOVA::oneWayF([$onlyClass]);
    }
}
|
Loading…
Reference in New Issue
Block a user