diff --git a/README.md b/README.md index bab758c..b17ac72 100644 --- a/README.md +++ b/README.md @@ -88,8 +88,9 @@ Example scripts are available in a separate repository [php-ai/php-ml-examples]( * [Token Count Vectorizer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/token-count-vectorizer/) * [Tf-idf Transformer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/tf-idf-transformer/) * Dimensionality Reduction - * PCA + * PCA (Principal Component Analysis) * Kernel PCA + * LDA (Linear Discriminant Analysis) * Datasets * [Array](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/array-dataset/) * [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset/) diff --git a/src/Phpml/DimensionReduction/EigenTransformerBase.php b/src/Phpml/DimensionReduction/EigenTransformerBase.php new file mode 100644 index 0000000..b399002 --- /dev/null +++ b/src/Phpml/DimensionReduction/EigenTransformerBase.php @@ -0,0 +1,98 @@ +getRealEigenvalues(); + $eigVects= $eig->getEigenvectors(); + + $totalEigVal = array_sum($eigVals); + // Sort eigenvalues in descending order + arsort($eigVals); + + $explainedVar = 0.0; + $vectors = []; + $values = []; + foreach ($eigVals as $i => $eigVal) { + $explainedVar += $eigVal / $totalEigVal; + $vectors[] = $eigVects[$i]; + $values[] = $eigVal; + + if ($this->numFeatures !== null) { + if (count($vectors) == $this->numFeatures) { + break; + } + } else { + if ($explainedVar >= $this->totalVariance) { + break; + } + } + } + + $this->eigValues = $values; + $this->eigVectors = $vectors; + } + + /** + * Returns the reduced data + * + * @param array $data + * + * @return array + */ + protected function reduce(array $data) + { + $m1 = new Matrix($data); + $m2 = new Matrix($this->eigVectors); + + return $m1->multiply($m2->transpose())->toArray(); + } +} diff --git a/src/Phpml/DimensionReduction/KernelPCA.php b/src/Phpml/DimensionReduction/KernelPCA.php index 86070c7..51fb17a 100644 --- a/src/Phpml/DimensionReduction/KernelPCA.php +++ b/src/Phpml/DimensionReduction/KernelPCA.php @@ -86,7 +86,7 @@ class KernelPCA extends PCA $matrix = $this->calculateKernelMatrix($this->data, $numRows); $matrix = $this->centerMatrix($matrix, $numRows); - list($this->eigValues, $this->eigVectors) = $this->eigenDecomposition($matrix, $numRows); + $this->eigenDecomposition($matrix, $numRows); $this->fit = true; diff --git a/src/Phpml/DimensionReduction/LDA.php b/src/Phpml/DimensionReduction/LDA.php new file mode 100644 index 0000000..28f34d6 --- /dev/null +++ b/src/Phpml/DimensionReduction/LDA.php @@ -0,0 +1,247 @@ +
+ * The algorithm can be initialized by speciyfing + * either with the totalVariance(a value between 0.1 and 0.99) + * or numFeatures (number of features in the dataset) to be preserved. + * + * @param float|null $totalVariance Total explained variance to be preserved + * @param int|null $numFeatures Number of features to be preserved + * + * @throws \Exception + */ + public function __construct($totalVariance = null, $numFeatures = null) + { + if ($totalVariance !== null && ($totalVariance < 0.1 || $totalVariance > 0.99)) { + throw new \Exception("Total variance can be a value between 0.1 and 0.99"); + } + if ($numFeatures !== null && $numFeatures <= 0) { + throw new \Exception("Number of features to be preserved should be greater than 0"); + } + if ($totalVariance !== null && $numFeatures !== null) { + throw new \Exception("Either totalVariance or numFeatures should be specified in order to run the algorithm"); + } + + if ($numFeatures !== null) { + $this->numFeatures = $numFeatures; + } + if ($totalVariance !== null) { + $this->totalVariance = $totalVariance; + } + } + + /** + * Trains the algorithm to transform the given data to a lower dimensional space. + * + * @param array $data + * @param array $classes + * + * @return array + */ + public function fit(array $data, array $classes) : array + { + $this->labels = $this->getLabels($classes); + $this->means = $this->calculateMeans($data, $classes); + + $sW = $this->calculateClassVar($data, $classes); + $sB = $this->calculateClassCov(); + + $S = $sW->inverse()->multiply($sB); + $this->eigenDecomposition($S->toArray()); + + $this->fit = true; + + return $this->reduce($data); + } + + /** + * Returns unique labels in the dataset + * + * @param array $classes + * + * @return array + */ + protected function getLabels(array $classes): array + { + $counts = array_count_values($classes); + + return array_keys($counts); + } + + + /** + * Calculates mean of each column for each class and returns + * n by m matrix where n is number of labels and m is number of columns + * + * @param type $data + * @param type $classes + * + * @return array + */ + protected function calculateMeans($data, $classes) : array + { + $means = []; + $counts= []; + $overallMean = array_fill(0, count($data[0]), 0.0); + + foreach ($data as $index => $row) { + $label = array_search($classes[$index], $this->labels); + + foreach ($row as $col => $val) { + if (! isset($means[$label][$col])) { + $means[$label][$col] = 0.0; + } + $means[$label][$col] += $val; + $overallMean[$col] += $val; + } + + if (! isset($counts[$label])) { + $counts[$label] = 0; + } + $counts[$label]++; + } + + foreach ($means as $index => $row) { + foreach ($row as $col => $sum) { + $means[$index][$col] = $sum / $counts[$index]; + } + } + + // Calculate overall mean of the dataset for each column + $numElements = array_sum($counts); + $map = function ($el) use ($numElements) { + return $el / $numElements; + }; + $this->overallMean = array_map($map, $overallMean); + $this->counts = $counts; + + return $means; + } + + + /** + * Returns in-class scatter matrix for each class, which + * is a n by m matrix where n is number of classes and + * m is number of columns + * + * @param array $data + * @param array $classes + * + * @return Matrix + */ + protected function calculateClassVar($data, $classes) + { + // s is an n (number of classes) by m (number of column) matrix + $s = array_fill(0, count($data[0]), array_fill(0, count($data[0]), 0)); + $sW = new Matrix($s, false); + + foreach ($data as $index => $row) { + $label = array_search($classes[$index], $this->labels); + $means = $this->means[$label]; + + $row = $this->calculateVar($row, $means); + + $sW = $sW->add($row); + } + + return $sW; + } + + /** + * Returns between-class scatter matrix for each class, which + * is an n by m matrix where n is number of classes and + * m is number of columns + * + * @return Matrix + */ + protected function calculateClassCov() + { + // s is an n (number of classes) by m (number of column) matrix + $s = array_fill(0, count($this->overallMean), array_fill(0, count($this->overallMean), 0)); + $sB = new Matrix($s, false); + + foreach ($this->means as $index => $classMeans) { + $row = $this->calculateVar($classMeans, $this->overallMean); + $N = $this->counts[$index]; + $sB = $sB->add($row->multiplyByScalar($N)); + } + + return $sB; + } + + /** + * Returns the result of the calculation (x - m)T.(x - m) + * + * @param array $row + * @param array $means + * + * @return Matrix + */ + protected function calculateVar(array $row, array $means) + { + $x = new Matrix($row, false); + $m = new Matrix($means, false); + $diff = $x->subtract($m); + + return $diff->transpose()->multiply($diff); + } + + /** + * Transforms the given sample to a lower dimensional vector by using + * the eigenVectors obtained in the last run of fit. + * + * @param array $sample + * + * @return array + */ + public function transform(array $sample) + { + if (!$this->fit) { + throw new \Exception("LDA has not been fitted with respect to original dataset, please run LDA::fit() first"); + } + + if (! is_array($sample[0])) { + $sample = [$sample]; + } + + return $this->reduce($sample); + } +} diff --git a/src/Phpml/DimensionReduction/PCA.php b/src/Phpml/DimensionReduction/PCA.php index 422dae4..db2110d 100644 --- a/src/Phpml/DimensionReduction/PCA.php +++ b/src/Phpml/DimensionReduction/PCA.php @@ -4,27 +4,12 @@ declare(strict_types=1); namespace Phpml\DimensionReduction; -use Phpml\Math\LinearAlgebra\EigenvalueDecomposition; use Phpml\Math\Statistic\Covariance; use Phpml\Math\Statistic\Mean; use Phpml\Math\Matrix; -class PCA +class PCA extends EigenTransformerBase { - /** - * Total variance to be conserved after the reduction - * - * @var float - */ - public $totalVariance = 0.9; - - /** - * Number of features to be preserved after the reduction - * - * @var int - */ - public $numFeatures = null; - /** * Temporary storage for mean values for each dimension in given data * @@ -32,20 +17,6 @@ class PCA */ protected $means = []; - /** - * Eigenvectors of the covariance matrix - * - * @var array - */ - protected $eigVectors = []; - - /** - * Top eigenValues of the covariance matrix - * - * @var type - */ - protected $eigValues = []; - /** * @var bool */ @@ -100,7 +71,7 @@ class PCA $covMatrix = Covariance::covarianceMatrix($data, array_fill(0, $n, 0)); - list($this->eigValues, $this->eigVectors) = $this->eigenDecomposition($covMatrix, $n); + $this->eigenDecomposition($covMatrix); $this->fit = true; @@ -146,63 +117,6 @@ class PCA return $data; } - /** - * Calculates eigenValues and eigenVectors of the given matrix. Returns - * top eigenVectors along with the largest eigenValues. The total explained variance - * of these eigenVectors will be no less than desired $totalVariance value - * - * @param array $matrix - * @param int $n - * - * @return array - */ - protected function eigenDecomposition(array $matrix, int $n) - { - $eig = new EigenvalueDecomposition($matrix); - $eigVals = $eig->getRealEigenvalues(); - $eigVects= $eig->getEigenvectors(); - - $totalEigVal = array_sum($eigVals); - // Sort eigenvalues in descending order - arsort($eigVals); - - $explainedVar = 0.0; - $vectors = []; - $values = []; - foreach ($eigVals as $i => $eigVal) { - $explainedVar += $eigVal / $totalEigVal; - $vectors[] = $eigVects[$i]; - $values[] = $eigVal; - - if ($this->numFeatures !== null) { - if (count($vectors) == $this->numFeatures) { - break; - } - } else { - if ($explainedVar >= $this->totalVariance) { - break; - } - } - } - - return [$values, $vectors]; - } - - /** - * Returns the reduced data - * - * @param array $data - * - * @return array - */ - protected function reduce(array $data) - { - $m1 = new Matrix($data); - $m2 = new Matrix($this->eigVectors); - - return $m1->multiply($m2->transpose())->toArray(); - } - /** * Transforms the given sample to a lower dimensional vector by using * the eigenVectors obtained in the last run of fit. diff --git a/src/Phpml/Math/LinearAlgebra/EigenvalueDecomposition.php b/src/Phpml/Math/LinearAlgebra/EigenvalueDecomposition.php index 27557bb..5cbc121 100644 --- a/src/Phpml/Math/LinearAlgebra/EigenvalueDecomposition.php +++ b/src/Phpml/Math/LinearAlgebra/EigenvalueDecomposition.php @@ -130,10 +130,10 @@ class EigenvalueDecomposition $this->e[$j] = $g; } $f = 0.0; + if ($h === 0 || $h < 1e-32) { + $h = 1e-32; + } for ($j = 0; $j < $i; ++$j) { - if ($h === 0) { - $h = 1e-20; - } $this->e[$j] /= $h; $f += $this->e[$j] * $this->d[$j]; } diff --git a/src/Phpml/Math/LinearAlgebra/LUDecomposition.php b/src/Phpml/Math/LinearAlgebra/LUDecomposition.php new file mode 100644 index 0000000..1aeb239 --- /dev/null +++ b/src/Phpml/Math/LinearAlgebra/LUDecomposition.php @@ -0,0 +1,297 @@ += n, the LU decomposition is an m-by-n + * unit lower triangular matrix L, an n-by-n upper triangular matrix U, + * and a permutation vector piv of length m so that A(piv,:) = L*U. + * If m < n, then L is m-by-m and U is m-by-n. + * + * The LU decompostion with pivoting always exists, even if the matrix is + * singular, so the constructor will never fail. The primary use of the + * LU decomposition is in the solution of square systems of simultaneous + * linear equations. This will fail if isNonsingular() returns false. + * + * @author Paul Meagher + * @author Bartosz Matosiuk + * @author Michael Bommarito + * @version 1.1 + * @license PHP v3.0 + * + * Slightly changed to adapt the original code to PHP-ML library + * @date 2017/04/24 + * @author Mustafa Karabulut + */ + +namespace Phpml\Math\LinearAlgebra; + +use Phpml\Math\Matrix; +use Phpml\Exception\MatrixException; + +class LUDecomposition +{ + /** + * Decomposition storage + * @var array + */ + private $LU = []; + + /** + * Row dimension. + * @var int + */ + private $m; + + /** + * Column dimension. + * @var int + */ + private $n; + + /** + * Pivot sign. + * @var int + */ + private $pivsign; + + /** + * Internal storage of pivot vector. + * @var array + */ + private $piv = []; + + + /** + * LU Decomposition constructor. + * + * @param $A Rectangular matrix + * @return Structure to access L, U and piv. + */ + public function __construct(Matrix $A) + { + if ($A->getRows() != $A->getColumns()) { + throw MatrixException::notSquareMatrix(); + } + + // Use a "left-looking", dot-product, Crout/Doolittle algorithm. + $this->LU = $A->toArray(); + $this->m = $A->getRows(); + $this->n = $A->getColumns(); + for ($i = 0; $i < $this->m; ++$i) { + $this->piv[$i] = $i; + } + $this->pivsign = 1; + $LUrowi = $LUcolj = []; + + // Outer loop. + for ($j = 0; $j < $this->n; ++$j) { + // Make a copy of the j-th column to localize references. + for ($i = 0; $i < $this->m; ++$i) { + $LUcolj[$i] = &$this->LU[$i][$j]; + } + // Apply previous transformations. + for ($i = 0; $i < $this->m; ++$i) { + $LUrowi = $this->LU[$i]; + // Most of the time is spent in the following dot product. + $kmax = min($i, $j); + $s = 0.0; + for ($k = 0; $k < $kmax; ++$k) { + $s += $LUrowi[$k] * $LUcolj[$k]; + } + $LUrowi[$j] = $LUcolj[$i] -= $s; + } + // Find pivot and exchange if necessary. + $p = $j; + for ($i = $j+1; $i < $this->m; ++$i) { + if (abs($LUcolj[$i]) > abs($LUcolj[$p])) { + $p = $i; + } + } + if ($p != $j) { + for ($k = 0; $k < $this->n; ++$k) { + $t = $this->LU[$p][$k]; + $this->LU[$p][$k] = $this->LU[$j][$k]; + $this->LU[$j][$k] = $t; + } + $k = $this->piv[$p]; + $this->piv[$p] = $this->piv[$j]; + $this->piv[$j] = $k; + $this->pivsign = $this->pivsign * -1; + } + // Compute multipliers. + if (($j < $this->m) && ($this->LU[$j][$j] != 0.0)) { + for ($i = $j+1; $i < $this->m; ++$i) { + $this->LU[$i][$j] /= $this->LU[$j][$j]; + } + } + } + } // function __construct() + + + /** + * Get lower triangular factor. + * + * @return array Lower triangular factor + */ + public function getL() + { + $L = []; + for ($i = 0; $i < $this->m; ++$i) { + for ($j = 0; $j < $this->n; ++$j) { + if ($i > $j) { + $L[$i][$j] = $this->LU[$i][$j]; + } elseif ($i == $j) { + $L[$i][$j] = 1.0; + } else { + $L[$i][$j] = 0.0; + } + } + } + return new Matrix($L); + } // function getL() + + + /** + * Get upper triangular factor. + * + * @return array Upper triangular factor + */ + public function getU() + { + $U = []; + for ($i = 0; $i < $this->n; ++$i) { + for ($j = 0; $j < $this->n; ++$j) { + if ($i <= $j) { + $U[$i][$j] = $this->LU[$i][$j]; + } else { + $U[$i][$j] = 0.0; + } + } + } + return new Matrix($U); + } // function getU() + + + /** + * Return pivot permutation vector. + * + * @return array Pivot vector + */ + public function getPivot() + { + return $this->piv; + } // function getPivot() + + + /** + * Alias for getPivot + * + * @see getPivot + */ + public function getDoublePivot() + { + return $this->getPivot(); + } // function getDoublePivot() + + + /** + * Is the matrix nonsingular? + * + * @return true if U, and hence A, is nonsingular. + */ + public function isNonsingular() + { + for ($j = 0; $j < $this->n; ++$j) { + if ($this->LU[$j][$j] == 0) { + return false; + } + } + return true; + } // function isNonsingular() + + + /** + * Count determinants + * + * @return array d matrix deterninat + */ + public function det() + { + if ($this->m == $this->n) { + $d = $this->pivsign; + for ($j = 0; $j < $this->n; ++$j) { + $d *= $this->LU[$j][$j]; + } + return $d; + } else { + throw MatrixException::notSquareMatrix(); + } + } // function det() + + + /** + * Solve A*X = B + * + * @param Matrix $B A Matrix with as many rows as A and any number of columns. + * + * @return array X so that L*U*X = B(piv,:) + * + * @throws MatrixException + */ + public function solve(Matrix $B) + { + if ($B->getRows() != $this->m) { + throw MatrixException::notSquareMatrix(); + } + + if (! $this->isNonsingular()) { + throw MatrixException::singularMatrix(); + } + + // Copy right hand side with pivoting + $nx = $B->getColumns(); + $X = $this->getSubMatrix($B->toArray(), $this->piv, 0, $nx-1); + // Solve L*Y = B(piv,:) + for ($k = 0; $k < $this->n; ++$k) { + for ($i = $k+1; $i < $this->n; ++$i) { + for ($j = 0; $j < $nx; ++$j) { + $X[$i][$j] -= $X[$k][$j] * $this->LU[$i][$k]; + } + } + } + // Solve U*X = Y; + for ($k = $this->n-1; $k >= 0; --$k) { + for ($j = 0; $j < $nx; ++$j) { + $X[$k][$j] /= $this->LU[$k][$k]; + } + for ($i = 0; $i < $k; ++$i) { + for ($j = 0; $j < $nx; ++$j) { + $X[$i][$j] -= $X[$k][$j] * $this->LU[$i][$k]; + } + } + } + return $X; + } // function solve() + + /** + * @param Matrix $matrix + * @param int $j0 + * @param int $jF + * + * @return array + */ + protected function getSubMatrix(array $matrix, array $RL, int $j0, int $jF) + { + $m = count($RL); + $n = $jF - $j0; + $R = array_fill(0, $m, array_fill(0, $n+1, 0.0)); + + for ($i = 0; $i < $m; ++$i) { + for ($j = $j0; $j <= $jF; ++$j) { + $R[$i][$j - $j0]= $matrix[ $RL[$i] ][$j]; + } + } + + return $R; + } +} // class LUDecomposition diff --git a/src/Phpml/Math/Matrix.php b/src/Phpml/Math/Matrix.php index 25101f3..c996e7f 100644 --- a/src/Phpml/Math/Matrix.php +++ b/src/Phpml/Math/Matrix.php @@ -4,6 +4,7 @@ declare(strict_types=1); namespace Phpml\Math; +use Phpml\Math\LinearAlgebra\LUDecomposition; use Phpml\Exception\InvalidArgumentException; use Phpml\Exception\MatrixException; @@ -137,32 +138,8 @@ class Matrix throw MatrixException::notSquareMatrix(); } - return $this->determinant = $this->calculateDeterminant(); - } - - /** - * @return float|int - * - * @throws MatrixException - */ - private function calculateDeterminant() - { - $determinant = 0; - if ($this->rows == 1 && $this->columns == 1) { - $determinant = $this->matrix[0][0]; - } elseif ($this->rows == 2 && $this->columns == 2) { - $determinant = - $this->matrix[0][0] * $this->matrix[1][1] - - $this->matrix[0][1] * $this->matrix[1][0]; - } else { - for ($j = 0; $j < $this->columns; ++$j) { - $subMatrix = $this->crossOut(0, $j); - $minor = $this->matrix[0][$j] * $subMatrix->getDeterminant(); - $determinant += fmod((float) $j, 2.0) == 0 ? $minor : -$minor; - } - } - - return $determinant; + $lu = new LUDecomposition($this); + return $this->determinant = $lu->det(); } /** @@ -303,21 +280,26 @@ class Matrix throw MatrixException::notSquareMatrix(); } - if ($this->isSingular()) { - throw MatrixException::singularMatrix(); + $LU = new LUDecomposition($this); + $identity = $this->getIdentity(); + $inverse = $LU->solve($identity); + + return new self($inverse, false); + } + + /** + * Returns diagonal identity matrix of the same size of this matrix + * + * @return Matrix + */ + protected function getIdentity() + { + $array = array_fill(0, $this->rows, array_fill(0, $this->columns, 0)); + for ($i=0; $i < $this->rows; $i++) { + $array[$i][$i] = 1; } - $newMatrix = []; - for ($i = 0; $i < $this->rows; ++$i) { - for ($j = 0; $j < $this->columns; ++$j) { - $minor = $this->crossOut($i, $j)->getDeterminant(); - $newMatrix[$i][$j] = fmod((float) ($i + $j), 2.0) == 0 ? $minor : -$minor; - } - } - - $cofactorMatrix = new self($newMatrix, false); - - return $cofactorMatrix->transpose()->divideByScalar($this->getDeterminant()); + return new self($array, false); } /** diff --git a/tests/Phpml/DimensionReduction/LDATest.php b/tests/Phpml/DimensionReduction/LDATest.php new file mode 100644 index 0000000..5ebe018 --- /dev/null +++ b/tests/Phpml/DimensionReduction/LDATest.php @@ -0,0 +1,65 @@ +fit($dataset->getSamples(), $dataset->getTargets()); + + // Some samples of the Iris data will be checked manually + // First 3 and last 3 rows from the original dataset + $data = [ + [5.1, 3.5, 1.4, 0.2], + [4.9, 3.0, 1.4, 0.2], + [4.7, 3.2, 1.3, 0.2], + [6.5, 3.0, 5.2, 2.0], + [6.2, 3.4, 5.4, 2.3], + [5.9, 3.0, 5.1, 1.8] + ]; + $transformed2 = [ + [-1.4922092756753, 1.9047102045574], + [-1.2576556684358, 1.608414450935], + [-1.3487505965419, 1.749846351699], + [1.7759343101456, 2.0371552314006], + [2.0059819019159, 2.4493123003226], + [1.701474913008, 1.9037880473772] + ]; + + $control = []; + $control = array_merge($control, array_slice($transformed, 0, 3)); + $control = array_merge($control, array_slice($transformed, -3)); + + $check = function ($row1, $row2) use ($epsilon) { + // Due to the fact that the sign of values can be flipped + // during the calculation of eigenValues, we have to compare + // absolute value of the values + $row1 = array_map('abs', $row1); + $row2 = array_map('abs', $row2); + $this->assertEquals($row1, $row2, '', $epsilon); + }; + array_map($check, $control, $transformed2); + + // Fitted LDA object should be able to return same values again + // for each projected row + foreach ($data as $i => $row) { + $newRow = [$transformed2[$i]]; + $newRow2= $lda->transform($row); + + array_map($check, $newRow, $newRow2); + } + } +}