Linear Discrimant Analysis (LDA) (#82)

* Linear Discrimant Analysis (LDA)

* LDA test file

* Matrix inverse via LUDecomposition

* LUDecomposition inverse() and det() applied

* Readme update for LDA
This commit is contained in:
Mustafa Karabulut 2017-04-25 09:58:02 +03:00 committed by Arkadiusz Kondas
parent 12b8b118dd
commit 5b373fa7c2
9 changed files with 736 additions and 132 deletions

View File

@ -88,8 +88,9 @@ Example scripts are available in a separate repository [php-ai/php-ml-examples](
* [Token Count Vectorizer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/token-count-vectorizer/)
* [Tf-idf Transformer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/tf-idf-transformer/)
* Dimensionality Reduction
* PCA
* PCA (Principal Component Analysis)
* Kernel PCA
* LDA (Linear Discriminant Analysis)
* Datasets
* [Array](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/array-dataset/)
* [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset/)

View File

@ -0,0 +1,98 @@
<?php declare(strict_types=1);
namespace Phpml\DimensionReduction;
use Phpml\Math\LinearAlgebra\EigenvalueDecomposition;
use Phpml\Math\Matrix;
/**
* Class to compute eigen pairs (values & vectors) of a given matrix
* with the consideration of numFeatures or totalVariance to be preserved
*
* @author hp
*/
abstract class EigenTransformerBase
{
/**
* Total variance to be conserved after the reduction
*
* @var float
*/
public $totalVariance = 0.9;
/**
* Number of features to be preserved after the reduction
*
* @var int
*/
public $numFeatures = null;
/**
* Top eigenvectors of the matrix
*
* @var array
*/
protected $eigVectors = [];
/**
* Top eigenValues of the matrix
*
* @var type
*/
protected $eigValues = [];
/**
* Calculates eigenValues and eigenVectors of the given matrix. Returns
* top eigenVectors along with the largest eigenValues. The total explained variance
* of these eigenVectors will be no less than desired $totalVariance value
*
* @param array $matrix
*/
protected function eigenDecomposition(array $matrix)
{
$eig = new EigenvalueDecomposition($matrix);
$eigVals = $eig->getRealEigenvalues();
$eigVects= $eig->getEigenvectors();
$totalEigVal = array_sum($eigVals);
// Sort eigenvalues in descending order
arsort($eigVals);
$explainedVar = 0.0;
$vectors = [];
$values = [];
foreach ($eigVals as $i => $eigVal) {
$explainedVar += $eigVal / $totalEigVal;
$vectors[] = $eigVects[$i];
$values[] = $eigVal;
if ($this->numFeatures !== null) {
if (count($vectors) == $this->numFeatures) {
break;
}
} else {
if ($explainedVar >= $this->totalVariance) {
break;
}
}
}
$this->eigValues = $values;
$this->eigVectors = $vectors;
}
/**
* Returns the reduced data
*
* @param array $data
*
* @return array
*/
protected function reduce(array $data)
{
$m1 = new Matrix($data);
$m2 = new Matrix($this->eigVectors);
return $m1->multiply($m2->transpose())->toArray();
}
}

View File

@ -86,7 +86,7 @@ class KernelPCA extends PCA
$matrix = $this->calculateKernelMatrix($this->data, $numRows);
$matrix = $this->centerMatrix($matrix, $numRows);
list($this->eigValues, $this->eigVectors) = $this->eigenDecomposition($matrix, $numRows);
$this->eigenDecomposition($matrix, $numRows);
$this->fit = true;

View File

@ -0,0 +1,247 @@
<?php
declare(strict_types=1);
namespace Phpml\DimensionReduction;
use Phpml\Math\Statistic\Mean;
use Phpml\Math\Matrix;
class LDA extends EigenTransformerBase
{
/**
* @var bool
*/
public $fit = false;
/**
* @var array
*/
public $labels;
/**
* @var array
*/
public $means;
/**
* @var array
*/
public $counts;
/**
* @var float
*/
public $overallMean;
/**
* Linear Discriminant Analysis (LDA) is used to reduce the dimensionality
* of the data. Unlike Principal Component Analysis (PCA), it is a supervised
* technique that requires the class labels in order to fit the data to a
* lower dimensional space. <br><br>
* The algorithm can be initialized by speciyfing
* either with the totalVariance(a value between 0.1 and 0.99)
* or numFeatures (number of features in the dataset) to be preserved.
*
* @param float|null $totalVariance Total explained variance to be preserved
* @param int|null $numFeatures Number of features to be preserved
*
* @throws \Exception
*/
public function __construct($totalVariance = null, $numFeatures = null)
{
if ($totalVariance !== null && ($totalVariance < 0.1 || $totalVariance > 0.99)) {
throw new \Exception("Total variance can be a value between 0.1 and 0.99");
}
if ($numFeatures !== null && $numFeatures <= 0) {
throw new \Exception("Number of features to be preserved should be greater than 0");
}
if ($totalVariance !== null && $numFeatures !== null) {
throw new \Exception("Either totalVariance or numFeatures should be specified in order to run the algorithm");
}
if ($numFeatures !== null) {
$this->numFeatures = $numFeatures;
}
if ($totalVariance !== null) {
$this->totalVariance = $totalVariance;
}
}
/**
* Trains the algorithm to transform the given data to a lower dimensional space.
*
* @param array $data
* @param array $classes
*
* @return array
*/
public function fit(array $data, array $classes) : array
{
$this->labels = $this->getLabels($classes);
$this->means = $this->calculateMeans($data, $classes);
$sW = $this->calculateClassVar($data, $classes);
$sB = $this->calculateClassCov();
$S = $sW->inverse()->multiply($sB);
$this->eigenDecomposition($S->toArray());
$this->fit = true;
return $this->reduce($data);
}
/**
* Returns unique labels in the dataset
*
* @param array $classes
*
* @return array
*/
protected function getLabels(array $classes): array
{
$counts = array_count_values($classes);
return array_keys($counts);
}
/**
* Calculates mean of each column for each class and returns
* n by m matrix where n is number of labels and m is number of columns
*
* @param type $data
* @param type $classes
*
* @return array
*/
protected function calculateMeans($data, $classes) : array
{
$means = [];
$counts= [];
$overallMean = array_fill(0, count($data[0]), 0.0);
foreach ($data as $index => $row) {
$label = array_search($classes[$index], $this->labels);
foreach ($row as $col => $val) {
if (! isset($means[$label][$col])) {
$means[$label][$col] = 0.0;
}
$means[$label][$col] += $val;
$overallMean[$col] += $val;
}
if (! isset($counts[$label])) {
$counts[$label] = 0;
}
$counts[$label]++;
}
foreach ($means as $index => $row) {
foreach ($row as $col => $sum) {
$means[$index][$col] = $sum / $counts[$index];
}
}
// Calculate overall mean of the dataset for each column
$numElements = array_sum($counts);
$map = function ($el) use ($numElements) {
return $el / $numElements;
};
$this->overallMean = array_map($map, $overallMean);
$this->counts = $counts;
return $means;
}
/**
* Returns in-class scatter matrix for each class, which
* is a n by m matrix where n is number of classes and
* m is number of columns
*
* @param array $data
* @param array $classes
*
* @return Matrix
*/
protected function calculateClassVar($data, $classes)
{
// s is an n (number of classes) by m (number of column) matrix
$s = array_fill(0, count($data[0]), array_fill(0, count($data[0]), 0));
$sW = new Matrix($s, false);
foreach ($data as $index => $row) {
$label = array_search($classes[$index], $this->labels);
$means = $this->means[$label];
$row = $this->calculateVar($row, $means);
$sW = $sW->add($row);
}
return $sW;
}
/**
* Returns between-class scatter matrix for each class, which
* is an n by m matrix where n is number of classes and
* m is number of columns
*
* @return Matrix
*/
protected function calculateClassCov()
{
// s is an n (number of classes) by m (number of column) matrix
$s = array_fill(0, count($this->overallMean), array_fill(0, count($this->overallMean), 0));
$sB = new Matrix($s, false);
foreach ($this->means as $index => $classMeans) {
$row = $this->calculateVar($classMeans, $this->overallMean);
$N = $this->counts[$index];
$sB = $sB->add($row->multiplyByScalar($N));
}
return $sB;
}
/**
* Returns the result of the calculation (x - m)T.(x - m)
*
* @param array $row
* @param array $means
*
* @return Matrix
*/
protected function calculateVar(array $row, array $means)
{
$x = new Matrix($row, false);
$m = new Matrix($means, false);
$diff = $x->subtract($m);
return $diff->transpose()->multiply($diff);
}
/**
* Transforms the given sample to a lower dimensional vector by using
* the eigenVectors obtained in the last run of <code>fit</code>.
*
* @param array $sample
*
* @return array
*/
public function transform(array $sample)
{
if (!$this->fit) {
throw new \Exception("LDA has not been fitted with respect to original dataset, please run LDA::fit() first");
}
if (! is_array($sample[0])) {
$sample = [$sample];
}
return $this->reduce($sample);
}
}

View File

@ -4,27 +4,12 @@ declare(strict_types=1);
namespace Phpml\DimensionReduction;
use Phpml\Math\LinearAlgebra\EigenvalueDecomposition;
use Phpml\Math\Statistic\Covariance;
use Phpml\Math\Statistic\Mean;
use Phpml\Math\Matrix;
class PCA
class PCA extends EigenTransformerBase
{
/**
* Total variance to be conserved after the reduction
*
* @var float
*/
public $totalVariance = 0.9;
/**
* Number of features to be preserved after the reduction
*
* @var int
*/
public $numFeatures = null;
/**
* Temporary storage for mean values for each dimension in given data
*
@ -32,20 +17,6 @@ class PCA
*/
protected $means = [];
/**
* Eigenvectors of the covariance matrix
*
* @var array
*/
protected $eigVectors = [];
/**
* Top eigenValues of the covariance matrix
*
* @var type
*/
protected $eigValues = [];
/**
* @var bool
*/
@ -100,7 +71,7 @@ class PCA
$covMatrix = Covariance::covarianceMatrix($data, array_fill(0, $n, 0));
list($this->eigValues, $this->eigVectors) = $this->eigenDecomposition($covMatrix, $n);
$this->eigenDecomposition($covMatrix);
$this->fit = true;
@ -146,63 +117,6 @@ class PCA
return $data;
}
/**
* Calculates eigenValues and eigenVectors of the given matrix. Returns
* top eigenVectors along with the largest eigenValues. The total explained variance
* of these eigenVectors will be no less than desired $totalVariance value
*
* @param array $matrix
* @param int $n
*
* @return array
*/
protected function eigenDecomposition(array $matrix, int $n)
{
$eig = new EigenvalueDecomposition($matrix);
$eigVals = $eig->getRealEigenvalues();
$eigVects= $eig->getEigenvectors();
$totalEigVal = array_sum($eigVals);
// Sort eigenvalues in descending order
arsort($eigVals);
$explainedVar = 0.0;
$vectors = [];
$values = [];
foreach ($eigVals as $i => $eigVal) {
$explainedVar += $eigVal / $totalEigVal;
$vectors[] = $eigVects[$i];
$values[] = $eigVal;
if ($this->numFeatures !== null) {
if (count($vectors) == $this->numFeatures) {
break;
}
} else {
if ($explainedVar >= $this->totalVariance) {
break;
}
}
}
return [$values, $vectors];
}
/**
* Returns the reduced data
*
* @param array $data
*
* @return array
*/
protected function reduce(array $data)
{
$m1 = new Matrix($data);
$m2 = new Matrix($this->eigVectors);
return $m1->multiply($m2->transpose())->toArray();
}
/**
* Transforms the given sample to a lower dimensional vector by using
* the eigenVectors obtained in the last run of <code>fit</code>.

View File

@ -130,10 +130,10 @@ class EigenvalueDecomposition
$this->e[$j] = $g;
}
$f = 0.0;
if ($h === 0 || $h < 1e-32) {
$h = 1e-32;
}
for ($j = 0; $j < $i; ++$j) {
if ($h === 0) {
$h = 1e-20;
}
$this->e[$j] /= $h;
$f += $this->e[$j] * $this->d[$j];
}

View File

@ -0,0 +1,297 @@
<?php declare(strict_types=1);
/**
* @package JAMA
*
* For an m-by-n matrix A with m >= n, the LU decomposition is an m-by-n
* unit lower triangular matrix L, an n-by-n upper triangular matrix U,
* and a permutation vector piv of length m so that A(piv,:) = L*U.
* If m < n, then L is m-by-m and U is m-by-n.
*
* The LU decompostion with pivoting always exists, even if the matrix is
* singular, so the constructor will never fail. The primary use of the
* LU decomposition is in the solution of square systems of simultaneous
* linear equations. This will fail if isNonsingular() returns false.
*
* @author Paul Meagher
* @author Bartosz Matosiuk
* @author Michael Bommarito
* @version 1.1
* @license PHP v3.0
*
* Slightly changed to adapt the original code to PHP-ML library
* @date 2017/04/24
* @author Mustafa Karabulut
*/
namespace Phpml\Math\LinearAlgebra;
use Phpml\Math\Matrix;
use Phpml\Exception\MatrixException;
class LUDecomposition
{
/**
* Decomposition storage
* @var array
*/
private $LU = [];
/**
* Row dimension.
* @var int
*/
private $m;
/**
* Column dimension.
* @var int
*/
private $n;
/**
* Pivot sign.
* @var int
*/
private $pivsign;
/**
* Internal storage of pivot vector.
* @var array
*/
private $piv = [];
/**
* LU Decomposition constructor.
*
* @param $A Rectangular matrix
* @return Structure to access L, U and piv.
*/
public function __construct(Matrix $A)
{
if ($A->getRows() != $A->getColumns()) {
throw MatrixException::notSquareMatrix();
}
// Use a "left-looking", dot-product, Crout/Doolittle algorithm.
$this->LU = $A->toArray();
$this->m = $A->getRows();
$this->n = $A->getColumns();
for ($i = 0; $i < $this->m; ++$i) {
$this->piv[$i] = $i;
}
$this->pivsign = 1;
$LUrowi = $LUcolj = [];
// Outer loop.
for ($j = 0; $j < $this->n; ++$j) {
// Make a copy of the j-th column to localize references.
for ($i = 0; $i < $this->m; ++$i) {
$LUcolj[$i] = &$this->LU[$i][$j];
}
// Apply previous transformations.
for ($i = 0; $i < $this->m; ++$i) {
$LUrowi = $this->LU[$i];
// Most of the time is spent in the following dot product.
$kmax = min($i, $j);
$s = 0.0;
for ($k = 0; $k < $kmax; ++$k) {
$s += $LUrowi[$k] * $LUcolj[$k];
}
$LUrowi[$j] = $LUcolj[$i] -= $s;
}
// Find pivot and exchange if necessary.
$p = $j;
for ($i = $j+1; $i < $this->m; ++$i) {
if (abs($LUcolj[$i]) > abs($LUcolj[$p])) {
$p = $i;
}
}
if ($p != $j) {
for ($k = 0; $k < $this->n; ++$k) {
$t = $this->LU[$p][$k];
$this->LU[$p][$k] = $this->LU[$j][$k];
$this->LU[$j][$k] = $t;
}
$k = $this->piv[$p];
$this->piv[$p] = $this->piv[$j];
$this->piv[$j] = $k;
$this->pivsign = $this->pivsign * -1;
}
// Compute multipliers.
if (($j < $this->m) && ($this->LU[$j][$j] != 0.0)) {
for ($i = $j+1; $i < $this->m; ++$i) {
$this->LU[$i][$j] /= $this->LU[$j][$j];
}
}
}
} // function __construct()
/**
* Get lower triangular factor.
*
* @return array Lower triangular factor
*/
public function getL()
{
$L = [];
for ($i = 0; $i < $this->m; ++$i) {
for ($j = 0; $j < $this->n; ++$j) {
if ($i > $j) {
$L[$i][$j] = $this->LU[$i][$j];
} elseif ($i == $j) {
$L[$i][$j] = 1.0;
} else {
$L[$i][$j] = 0.0;
}
}
}
return new Matrix($L);
} // function getL()
/**
* Get upper triangular factor.
*
* @return array Upper triangular factor
*/
public function getU()
{
$U = [];
for ($i = 0; $i < $this->n; ++$i) {
for ($j = 0; $j < $this->n; ++$j) {
if ($i <= $j) {
$U[$i][$j] = $this->LU[$i][$j];
} else {
$U[$i][$j] = 0.0;
}
}
}
return new Matrix($U);
} // function getU()
/**
* Return pivot permutation vector.
*
* @return array Pivot vector
*/
public function getPivot()
{
return $this->piv;
} // function getPivot()
/**
* Alias for getPivot
*
* @see getPivot
*/
public function getDoublePivot()
{
return $this->getPivot();
} // function getDoublePivot()
/**
* Is the matrix nonsingular?
*
* @return true if U, and hence A, is nonsingular.
*/
public function isNonsingular()
{
for ($j = 0; $j < $this->n; ++$j) {
if ($this->LU[$j][$j] == 0) {
return false;
}
}
return true;
} // function isNonsingular()
/**
* Count determinants
*
* @return array d matrix deterninat
*/
public function det()
{
if ($this->m == $this->n) {
$d = $this->pivsign;
for ($j = 0; $j < $this->n; ++$j) {
$d *= $this->LU[$j][$j];
}
return $d;
} else {
throw MatrixException::notSquareMatrix();
}
} // function det()
/**
* Solve A*X = B
*
* @param Matrix $B A Matrix with as many rows as A and any number of columns.
*
* @return array X so that L*U*X = B(piv,:)
*
* @throws MatrixException
*/
public function solve(Matrix $B)
{
if ($B->getRows() != $this->m) {
throw MatrixException::notSquareMatrix();
}
if (! $this->isNonsingular()) {
throw MatrixException::singularMatrix();
}
// Copy right hand side with pivoting
$nx = $B->getColumns();
$X = $this->getSubMatrix($B->toArray(), $this->piv, 0, $nx-1);
// Solve L*Y = B(piv,:)
for ($k = 0; $k < $this->n; ++$k) {
for ($i = $k+1; $i < $this->n; ++$i) {
for ($j = 0; $j < $nx; ++$j) {
$X[$i][$j] -= $X[$k][$j] * $this->LU[$i][$k];
}
}
}
// Solve U*X = Y;
for ($k = $this->n-1; $k >= 0; --$k) {
for ($j = 0; $j < $nx; ++$j) {
$X[$k][$j] /= $this->LU[$k][$k];
}
for ($i = 0; $i < $k; ++$i) {
for ($j = 0; $j < $nx; ++$j) {
$X[$i][$j] -= $X[$k][$j] * $this->LU[$i][$k];
}
}
}
return $X;
} // function solve()
/**
* @param Matrix $matrix
* @param int $j0
* @param int $jF
*
* @return array
*/
protected function getSubMatrix(array $matrix, array $RL, int $j0, int $jF)
{
$m = count($RL);
$n = $jF - $j0;
$R = array_fill(0, $m, array_fill(0, $n+1, 0.0));
for ($i = 0; $i < $m; ++$i) {
for ($j = $j0; $j <= $jF; ++$j) {
$R[$i][$j - $j0]= $matrix[ $RL[$i] ][$j];
}
}
return $R;
}
} // class LUDecomposition

View File

@ -4,6 +4,7 @@ declare(strict_types=1);
namespace Phpml\Math;
use Phpml\Math\LinearAlgebra\LUDecomposition;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Exception\MatrixException;
@ -137,32 +138,8 @@ class Matrix
throw MatrixException::notSquareMatrix();
}
return $this->determinant = $this->calculateDeterminant();
}
/**
* @return float|int
*
* @throws MatrixException
*/
private function calculateDeterminant()
{
$determinant = 0;
if ($this->rows == 1 && $this->columns == 1) {
$determinant = $this->matrix[0][0];
} elseif ($this->rows == 2 && $this->columns == 2) {
$determinant =
$this->matrix[0][0] * $this->matrix[1][1] -
$this->matrix[0][1] * $this->matrix[1][0];
} else {
for ($j = 0; $j < $this->columns; ++$j) {
$subMatrix = $this->crossOut(0, $j);
$minor = $this->matrix[0][$j] * $subMatrix->getDeterminant();
$determinant += fmod((float) $j, 2.0) == 0 ? $minor : -$minor;
}
}
return $determinant;
$lu = new LUDecomposition($this);
return $this->determinant = $lu->det();
}
/**
@ -303,21 +280,26 @@ class Matrix
throw MatrixException::notSquareMatrix();
}
if ($this->isSingular()) {
throw MatrixException::singularMatrix();
$LU = new LUDecomposition($this);
$identity = $this->getIdentity();
$inverse = $LU->solve($identity);
return new self($inverse, false);
}
/**
* Returns diagonal identity matrix of the same size of this matrix
*
* @return Matrix
*/
protected function getIdentity()
{
$array = array_fill(0, $this->rows, array_fill(0, $this->columns, 0));
for ($i=0; $i < $this->rows; $i++) {
$array[$i][$i] = 1;
}
$newMatrix = [];
for ($i = 0; $i < $this->rows; ++$i) {
for ($j = 0; $j < $this->columns; ++$j) {
$minor = $this->crossOut($i, $j)->getDeterminant();
$newMatrix[$i][$j] = fmod((float) ($i + $j), 2.0) == 0 ? $minor : -$minor;
}
}
$cofactorMatrix = new self($newMatrix, false);
return $cofactorMatrix->transpose()->divideByScalar($this->getDeterminant());
return new self($array, false);
}
/**

View File

@ -0,0 +1,65 @@
<?php
declare(strict_types=1);
namespace tests\DimensionReduction;
use Phpml\DimensionReduction\LDA;
use Phpml\Dataset\Demo\IrisDataset;
use PHPUnit\Framework\TestCase;
class LDATest extends TestCase
{
public function testLDA()
{
// Acceptable error
$epsilon = 0.001;
// IRIS dataset will be used to train LDA
$dataset = new IrisDataset();
$lda = new LDA(null, 2);
$transformed = $lda->fit($dataset->getSamples(), $dataset->getTargets());
// Some samples of the Iris data will be checked manually
// First 3 and last 3 rows from the original dataset
$data = [
[5.1, 3.5, 1.4, 0.2],
[4.9, 3.0, 1.4, 0.2],
[4.7, 3.2, 1.3, 0.2],
[6.5, 3.0, 5.2, 2.0],
[6.2, 3.4, 5.4, 2.3],
[5.9, 3.0, 5.1, 1.8]
];
$transformed2 = [
[-1.4922092756753, 1.9047102045574],
[-1.2576556684358, 1.608414450935],
[-1.3487505965419, 1.749846351699],
[1.7759343101456, 2.0371552314006],
[2.0059819019159, 2.4493123003226],
[1.701474913008, 1.9037880473772]
];
$control = [];
$control = array_merge($control, array_slice($transformed, 0, 3));
$control = array_merge($control, array_slice($transformed, -3));
$check = function ($row1, $row2) use ($epsilon) {
// Due to the fact that the sign of values can be flipped
// during the calculation of eigenValues, we have to compare
// absolute value of the values
$row1 = array_map('abs', $row1);
$row2 = array_map('abs', $row2);
$this->assertEquals($row1, $row2, '', $epsilon);
};
array_map($check, $control, $transformed2);
// Fitted LDA object should be able to return same values again
// for each projected row
foreach ($data as $i => $row) {
$newRow = [$transformed2[$i]];
$newRow2= $lda->transform($row);
array_map($check, $newRow, $newRow2);
}
}
}