2017-04-23 07:03:30 +00:00
|
|
|
<?php
|
|
|
|
|
|
|
|
declare(strict_types=1);
|
|
|
|
|
|
|
|
namespace Phpml\DimensionReduction;
|
|
|
|
|
2018-03-06 22:26:36 +00:00
|
|
|
use Phpml\Exception\InvalidArgumentException;
|
|
|
|
use Phpml\Exception\InvalidOperationException;
|
2017-04-23 07:03:30 +00:00
|
|
|
use Phpml\Math\Statistic\Covariance;
|
|
|
|
use Phpml\Math\Statistic\Mean;
|
|
|
|
|
2017-04-25 06:58:02 +00:00
|
|
|
class PCA extends EigenTransformerBase
{
    /**
     * Mean value of every column (dimension) of the data set used by the
     * most recent fit(). These values are subtracted from each sample in
     * normalize() so that fitted and transformed data share the same center.
     *
     * @var array
     */
    protected $means = [];

    /**
     * Set to true once fit() has completed; transform() refuses to run before that.
     *
     * @var bool
     */
    protected $fit = false;

    /**
     * PCA (Principal Component Analysis) is used to explain the given data
     * with a lower number of dimensions. The analysis transforms the data to
     * a lower dimensional version of it by conserving a proportion of the
     * total variance within the data. It is a lossy data compression technique.
     *
     * Exactly one of the two arguments must be given; passing both or neither
     * is rejected.
     *
     * @param float|null $totalVariance Total explained variance to be preserved (between 0.1 and 0.99, inclusive)
     * @param int|null   $numFeatures   Number of features to be preserved (greater than 0)
     *
     * @throws InvalidArgumentException When an argument is out of range, or when
     *                                  both or none of the arguments are specified
     */
    public function __construct(?float $totalVariance = null, ?int $numFeatures = null)
    {
        if ($totalVariance !== null && ($totalVariance < 0.1 || $totalVariance > 0.99)) {
            throw new InvalidArgumentException('Total variance can be a value between 0.1 and 0.99');
        }

        if ($numFeatures !== null && $numFeatures <= 0) {
            throw new InvalidArgumentException('Number of features to be preserved should be greater than 0');
        }

        // Both sides are equal when both arguments are given or both are omitted;
        // either way the reduction target would be ambiguous or missing
        if (($totalVariance !== null) === ($numFeatures !== null)) {
            throw new InvalidArgumentException('Either totalVariance or numFeatures should be specified in order to run the algorithm');
        }

        // NOTE(review): numFeatures and totalVariance are not declared in this
        // class; presumably they are inherited from EigenTransformerBase
        if ($numFeatures !== null) {
            $this->numFeatures = $numFeatures;
        }

        if ($totalVariance !== null) {
            $this->totalVariance = $totalVariance;
        }
    }

    /**
     * Takes a data set and returns a lower dimensional version of it
     * while preserving $totalVariance or $numFeatures.
     *
     * $data is an n-by-m matrix and the returned array is an
     * n-by-k matrix where k <= m.
     *
     * The column means are recalculated on every call, so the same instance
     * can be fitted again with a different data set.
     *
     * @param array $data Rows of samples; the column count is taken from $data[0]
     *
     * @return array The reduced data as returned by the inherited reduce()
     *
     * @throws InvalidArgumentException When $data is empty
     */
    public function fit(array $data): array
    {
        if (count($data) === 0) {
            throw new InvalidArgumentException('Data to fit PCA should contain at least one sample');
        }

        $n = count($data[0]);

        // normalize() only calculates the means when none are cached, which would
        // make a second fit() center the new data with the means of the previous
        // data set; refresh them explicitly for the data being fitted
        $this->calculateMeans($data, $n);

        $data = $this->normalize($data, $n);

        // The data is centered at this point, hence the zero means handed to the covariance helper
        $covMatrix = Covariance::covarianceMatrix($data, array_fill(0, $n, 0));

        $this->eigenDecomposition($covMatrix);

        $this->fit = true;

        return $this->reduce($data);
    }

    /**
     * Transforms the given sample to a lower dimensional vector by using
     * the eigenVectors obtained in the last run of fit().
     *
     * Accepts either a single sample (flat array of values) or a list of
     * samples; a single sample is wrapped into a one-row matrix first.
     *
     * @param array $sample One sample or a list of samples
     *
     * @return array The reduced sample(s) as returned by the inherited reduce()
     *
     * @throws InvalidOperationException When fit() has not been run yet
     * @throws InvalidArgumentException  When $sample is empty
     */
    public function transform(array $sample): array
    {
        if (!$this->fit) {
            throw new InvalidOperationException('PCA has not been fitted with respect to original dataset, please run PCA::fit() first');
        }

        if (count($sample) === 0) {
            throw new InvalidArgumentException('Sample to be transformed should not be empty');
        }

        // A flat sample is promoted to a matrix with a single row
        if (!is_array($sample[0])) {
            $sample = [$sample];
        }

        // Means are already cached by fit(), so this centers the sample
        // with the means of the fitted data set
        $sample = $this->normalize($sample, count($sample[0]));

        return $this->reduce($sample);
    }

    /**
     * Calculates the mean of each of the first $n columns of $data and
     * stores them in $this->means, replacing any previously stored values.
     *
     * @param array $data Rows of samples
     * @param int   $n    Number of columns (dimensions) to process
     */
    protected function calculateMeans(array $data, int $n): void
    {
        // Calculate means for each dimension
        $this->means = [];
        for ($i = 0; $i < $n; ++$i) {
            $column = array_column($data, $i);
            $this->means[] = Mean::arithmetic($column);
        }
    }

    /**
     * Normalization of the data includes subtracting the mean from
     * each dimension, therefore dimensions will be centered to zero.
     *
     * Means are calculated from $data only when none are stored yet;
     * otherwise the stored means are reused.
     *
     * @param array $data Rows of samples
     * @param int   $n    Number of columns (dimensions) to center
     *
     * @return array A centered copy of $data
     */
    protected function normalize(array $data, int $n): array
    {
        if (count($this->means) === 0) {
            $this->calculateMeans($data, $n);
        }

        // Normalize data; rows are addressed by their own keys, columns by position
        foreach (array_keys($data) as $i) {
            for ($k = 0; $k < $n; ++$k) {
                $data[$i][$k] -= $this->means[$k];
            }
        }

        return $data;
    }
}
|