Implement first regression scoring function UnivariateLinearRegression

This commit is contained in:
Arkadiusz Kondas 2018-02-12 00:26:34 +01:00
parent fbf84ca95f
commit 9e5b3a0c69
5 changed files with 202 additions and 2 deletions

View File

@ -0,0 +1,81 @@
<?php
declare(strict_types=1);
namespace Phpml\FeatureSelection\ScoringFunction;
use Phpml\FeatureSelection\ScoringFunction;
use Phpml\Math\Matrix;
use Phpml\Math\Statistic\Mean;
/**
* Quick linear model for testing the effect of a single regressor,
* sequentially for many regressors.
*
* This is done in 2 steps:
*
* 1. The cross correlation between each regressor and the target is computed,
* that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) *std(y)).
* 2. It is converted to an F score (unlike scikit-learn's f_regression, no p-value is computed here).
*
* Ported from scikit-learn f_regression function (http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html#sklearn.feature_selection.f_regression)
*/
final class UnivariateLinearRegression implements ScoringFunction
{
    /**
     * Whether samples and targets are mean-centered before scoring.
     *
     * @var bool
     */
    private $center;

    /**
     * @param bool $center - if true samples and targets will be centered
     */
    public function __construct(bool $center = true)
    {
        $this->center = $center;
    }

    /**
     * Computes one F score per feature column from the correlation between
     * that column and the targets.
     *
     * Degenerate inputs never trigger a division by zero:
     *  - a feature column (or the targets) with zero norm has no defined
     *    correlation and is scored 0.0;
     *  - a feature that is perfectly (anti-)correlated with the targets is
     *    scored INF.
     *
     * @param array $samples rows of feature values; the keys of the first row define the feature columns
     * @param array $targets one target value per sample
     *
     * @return float[] F scores, keyed like the first sample
     */
    public function score(array $samples, array $targets): array
    {
        if ($this->center) {
            $this->centerTargets($targets);
            $this->centerSamples($samples);
        }

        // The target norm is the same for every feature, so compute it once.
        $targetNorm = (new Matrix($targets, false))->frobeniusNorm();

        // One degree of freedom is used by the slope, one more by the mean when centering.
        $degreesOfFreedom = count($targets) - ($this->center ? 2 : 1);

        $scores = [];
        foreach ($samples[0] as $index => $unused) {
            $featureColumn = array_column($samples, $index);
            $featureNorm = (new Matrix($featureColumn, false))->frobeniusNorm();

            if ($featureNorm === 0.0 || $targetNorm === 0.0) {
                // Correlation is undefined here; rank such a feature last.
                $scores[$index] = 0.0;

                continue;
            }

            $correlation = Matrix::dot($targets, $featureColumn)[0] / $featureNorm / $targetNorm;
            $scores[$index] = $this->correlationToFScore($correlation, $degreesOfFreedom);
        }

        return $scores;
    }

    /**
     * Converts a correlation coefficient r into r² / (1 - r²) * degreesOfFreedom.
     *
     * Returns INF when 1 - r² is not positive (perfect correlation, possibly
     * overshooting 1 by floating point rounding) instead of dividing by zero.
     */
    private function correlationToFScore(float $correlation, int $degreesOfFreedom): float
    {
        $explained = $correlation ** 2;
        $unexplained = 1 - $explained;

        if ($unexplained <= 0.0) {
            return INF;
        }

        return $explained / $unexplained * $degreesOfFreedom;
    }

    /**
     * Subtracts the arithmetic mean of the targets from every target, in place.
     */
    private function centerTargets(array &$targets): void
    {
        $mean = Mean::arithmetic($targets);

        // Write back by key so no reference outlives the loop.
        foreach ($targets as $key => $target) {
            $targets[$key] = $target - $mean;
        }
    }

    /**
     * Subtracts each feature column's arithmetic mean from that column, in place.
     */
    private function centerSamples(array &$samples): void
    {
        $means = [];
        foreach ($samples[0] as $index => $unused) {
            $means[$index] = Mean::arithmetic(array_column($samples, $index));
        }

        foreach ($samples as $row => $sample) {
            foreach ($sample as $index => $feature) {
                $samples[$row][$index] = $feature - $means[$index];
            }
        }
    }
}

View File

@ -236,6 +236,29 @@ class Matrix
return $this->getDeterminant() == 0;
}
/**
* Frobenius norm (Hilbert–Schmidt norm, Euclidean norm) (‖A‖F)
* Square root of the sum of the squares of all elements.
*
* https://en.wikipedia.org/wiki/Matrix_norm#Frobenius_norm
*
*          _______________
*         / ᵐ   ⁿ
* ‖A‖F = √  Σ   Σ  |aᵢⱼ|²
*          ᵢ₌₁ ⱼ₌₁
*/
public function frobeniusNorm(): float
{
    // Running total of the squared entries, visited row by row.
    $total = 0;

    $row = 0;
    while ($row < $this->rows) {
        $col = 0;
        while ($col < $this->columns) {
            $total += ($this->matrix[$row][$col]) ** 2;
            ++$col;
        }

        ++$row;
    }

    return sqrt($total);
}
/**
* Returns the transpose of given array
*/
@ -259,7 +282,7 @@ class Matrix
/**
* Element-wise addition or subtraction depending on the given sign parameter
*/
protected function _add(self $other, int $sign = 1): self
private function _add(self $other, int $sign = 1): self
{
$a1 = $this->toArray();
$a2 = $other->toArray();
@ -277,7 +300,7 @@ class Matrix
/**
* Returns diagonal identity matrix of the same size of this matrix
*/
protected function getIdentity(): self
private function getIdentity(): self
{
$array = array_fill(0, $this->rows, array_fill(0, $this->columns, 0));
for ($i = 0; $i < $this->rows; ++$i) {

View File

@ -0,0 +1,29 @@
<?php
declare(strict_types=1);
namespace Phpml\Tests\FeatureSelection\ScoringFunction;
use Phpml\FeatureSelection\ScoringFunction\UnivariateLinearRegression;
use PHPUnit\Framework\TestCase;
final class UnivariateLinearRegressionTest extends TestCase
{
    /**
     * Scores with the default (centered) configuration.
     */
    public function testRegressionScore(): void
    {
        [$samples, $targets] = $this->dataset();

        $scores = (new UnivariateLinearRegression())->score($samples, $targets);

        self::assertEquals([6.97286, 6.48558], $scores, '', 0.0001);
    }

    /**
     * Scores with centering switched off.
     */
    public function testRegressionScoreWithoutCenter(): void
    {
        [$samples, $targets] = $this->dataset();

        $scores = (new UnivariateLinearRegression(false))->score($samples, $targets);

        self::assertEquals([1.74450, 18.08347], $scores, '', 0.0001);
    }

    /**
     * Fixture shared by both tests: twelve two-feature samples and their targets.
     *
     * @return array[] [samples, targets]
     */
    private function dataset(): array
    {
        return [
            [
                [73676, 1996],
                [77006, 1998],
                [10565, 2000],
                [146088, 1995],
                [15000, 2001],
                [65940, 2000],
                [9300, 2000],
                [93739, 1996],
                [153260, 1994],
                [17764, 2002],
                [57000, 1998],
                [15000, 2000],
            ],
            [2000, 2750, 15500, 960, 4400, 8800, 7100, 2550, 1025, 5900, 4600, 4400],
        ];
    }
}

View File

@ -8,6 +8,7 @@ use Phpml\Dataset\Demo\IrisDataset;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Exception\InvalidOperationException;
use Phpml\FeatureSelection\ScoringFunction\ANOVAFValue;
use Phpml\FeatureSelection\ScoringFunction\UnivariateLinearRegression;
use Phpml\FeatureSelection\SelectKBest;
use PHPUnit\Framework\TestCase;
@ -45,6 +46,21 @@ final class SelectKBestTest extends TestCase
self::assertEquals(2, count($samples[0]));
}
/**
 * Keeps the two best-scoring columns when features are ranked with
 * UnivariateLinearRegression.
 */
public function testSelectKBestWithRegressionScoring(): void
{
    $targets = [2000, 2750, 15500, 960, 4400, 8800, 7100, 2550, 1025, 5900, 4600, 4400];
    $samples = [
        [73676, 1996, 2],
        [77006, 1998, 5],
        [10565, 2000, 4],
        [146088, 1995, 2],
        [15000, 2001, 2],
        [65940, 2000, 2],
        [9300, 2000, 2],
        [93739, 1996, 2],
        [153260, 1994, 2],
        [17764, 2002, 2],
        [57000, 1998, 2],
        [15000, 2000, 2],
    ];
    // Same rows with the third column dropped.
    $expected = [
        [73676, 1996],
        [77006, 1998],
        [10565, 2000],
        [146088, 1995],
        [15000, 2001],
        [65940, 2000],
        [9300, 2000],
        [93739, 1996],
        [153260, 1994],
        [17764, 2002],
        [57000, 1998],
        [15000, 2000],
    ];

    $selector = new SelectKBest(new UnivariateLinearRegression(), 2);
    $selector->fit($samples, $targets);
    $selector->transform($samples);

    self::assertEquals($expected, $samples);
}
public function testThrowExceptionOnEmptyTargets(): void
{
$this->expectException(InvalidArgumentException::class);

View File

@ -251,4 +251,55 @@ class MatrixTest extends TestCase
$dot = [6, 12];
$this->assertEquals($dot, Matrix::dot($matrix2, $matrix1));
}
/**
* @dataProvider dataProviderForFrobeniusNorm
*/
public function testFrobeniusNorm(array $matrix, float $norm): void
{
    // Build the matrix from the provided rows and compare its norm within a 1e-4 tolerance.
    $actual = (new Matrix($matrix))->frobeniusNorm();

    $this->assertEquals($norm, $actual, '', 0.0001);
}
/**
 * Data sets for testFrobeniusNorm: each entry is [matrix rows, expected norm].
 *
 * @return array[]
 */
public function dataProviderForFrobeniusNorm(): array
{
    return [
        [
            [
                [1, -7],
                [2, 3],
            ], 7.93725,
        ],
        [
            [
                [1, 2, 3],
                [2, 3, 4],
                [3, 4, 5],
            ], 9.643651,
        ],
        [
            [
                [1, 5, 3, 9],
                [2, 3, 4, 12],
                [4, 2, 5, 11],
            ], 21.330729,
        ],
        [
            [
                [1, 5, 3],
                [2, 3, 4],
                [4, 2, 5],
                [6, 6, 3],
            ], 13.784049,
        ],
        [
            [
                [5, -4, 2],
                [-1, 2, 3],
                [-2, 1, 0],
            ], 8.0, // float literal, matching the float $norm parameter of the test
        ],
    ];
}
}