php-ml/src/DimensionReduction/KernelPCA.php

235 lines
7.0 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
declare(strict_types=1);
namespace Phpml\DimensionReduction;
use Closure;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Exception\InvalidOperationException;
use Phpml\Math\Distance\Euclidean;
use Phpml\Math\Distance\Manhattan;
use Phpml\Math\Matrix;
class KernelPCA extends PCA
{
public const KERNEL_RBF = 1;
public const KERNEL_SIGMOID = 2;
public const KERNEL_LAPLACIAN = 3;
public const KERNEL_LINEAR = 4;
/**
* Selected kernel function
*
* @var int
*/
protected $kernel;
/**
* Gamma value used by the kernel
*
* @var float|null
*/
protected $gamma;
/**
* Original dataset used to fit KernelPCA
*
* @var array
*/
protected $data = [];
/**
* Kernel principal component analysis (KernelPCA) is an extension of PCA using
* techniques of kernel methods. It is more suitable for data that involves
* vectors that are not linearly separable<br><br>
* Example: <b>$kpca = new KernelPCA(KernelPCA::KERNEL_RBF, null, 2, 15.0);</b>
* will initialize the algorithm with an RBF kernel having the gamma parameter as 15,0. <br>
* This transformation will return the same number of rows with only <i>2</i> columns.
*
* @param float $totalVariance Total variance to be preserved if numFeatures is not given
* @param int $numFeatures Number of columns to be returned
* @param float $gamma Gamma parameter is used with RBF and Sigmoid kernels
*
* @throws InvalidArgumentException
*/
public function __construct(int $kernel = self::KERNEL_RBF, ?float $totalVariance = null, ?int $numFeatures = null, ?float $gamma = null)
{
if (!in_array($kernel, [self::KERNEL_RBF, self::KERNEL_SIGMOID, self::KERNEL_LAPLACIAN, self::KERNEL_LINEAR], true)) {
throw new InvalidArgumentException('KernelPCA can be initialized with the following kernels only: Linear, RBF, Sigmoid and Laplacian');
}
parent::__construct($totalVariance, $numFeatures);
$this->kernel = $kernel;
$this->gamma = $gamma;
}
/**
* Takes a data and returns a lower dimensional version
* of this data while preserving $totalVariance or $numFeatures. <br>
* $data is an n-by-m matrix and returned array is
* n-by-k matrix where k <= m
*/
public function fit(array $data): array
{
$numRows = count($data);
$this->data = $data;
if ($this->gamma === null) {
$this->gamma = 1.0 / $numRows;
}
$matrix = $this->calculateKernelMatrix($this->data, $numRows);
$matrix = $this->centerMatrix($matrix, $numRows);
$this->eigenDecomposition($matrix);
$this->fit = true;
return Matrix::transposeArray($this->eigVectors);
}
/**
* Transforms the given sample to a lower dimensional vector by using
* the variables obtained during the last run of <code>fit</code>.
*
* @throws InvalidArgumentException
* @throws InvalidOperationException
*/
public function transform(array $sample): array
{
if (!$this->fit) {
throw new InvalidOperationException('KernelPCA has not been fitted with respect to original dataset, please run KernelPCA::fit() first');
}
if (is_array($sample[0])) {
throw new InvalidArgumentException('KernelPCA::transform() accepts only one-dimensional arrays');
}
$pairs = $this->getDistancePairs($sample);
return $this->projectSample($pairs);
}
/**
* Calculates similarity matrix by use of selected kernel function<br>
* An n-by-m matrix is given and an n-by-n matrix is returned
*/
protected function calculateKernelMatrix(array $data, int $numRows): array
{
$kernelFunc = $this->getKernel();
$matrix = [];
for ($i = 0; $i < $numRows; ++$i) {
for ($k = 0; $k < $numRows; ++$k) {
if ($i <= $k) {
$matrix[$i][$k] = $kernelFunc($data[$i], $data[$k]);
} else {
$matrix[$i][$k] = $matrix[$k][$i];
}
}
}
return $matrix;
}
/**
* Kernel matrix is centered in its original space by using the following
* conversion:
*
* K = K N.K K.N + N.K.N where N is n-by-n matrix filled with 1/n
*/
protected function centerMatrix(array $matrix, int $n): array
{
$N = array_fill(0, $n, array_fill(0, $n, 1.0 / $n));
$N = new Matrix($N, false);
$K = new Matrix($matrix, false);
// K.N (This term is repeated so we cache it once)
$K_N = $K->multiply($N);
// N.K
$N_K = $N->multiply($K);
// N.K.N
$N_K_N = $N->multiply($K_N);
return $K->subtract($N_K)
->subtract($K_N)
->add($N_K_N)
->toArray();
}
/**
* Returns the callable kernel function
*
* @throws \Exception
*/
protected function getKernel(): Closure
{
switch ($this->kernel) {
case self::KERNEL_LINEAR:
// k(x,y) = xT.y
return function ($x, $y) {
return Matrix::dot($x, $y)[0];
};
case self::KERNEL_RBF:
// k(x,y)=exp(-γ.|x-y|) where |..| is Euclidean distance
$dist = new Euclidean();
return function ($x, $y) use ($dist): float {
return exp(-$this->gamma * $dist->sqDistance($x, $y));
};
case self::KERNEL_SIGMOID:
// k(x,y)=tanh(γ.xT.y+c0) where c0=1
return function ($x, $y): float {
$res = Matrix::dot($x, $y)[0] + 1.0;
return tanh((float) $this->gamma * $res);
};
case self::KERNEL_LAPLACIAN:
// k(x,y)=exp(-γ.|x-y|) where |..| is Manhattan distance
$dist = new Manhattan();
return function ($x, $y) use ($dist): float {
return exp(-$this->gamma * $dist->distance($x, $y));
};
default:
// Not reached
throw new InvalidArgumentException(sprintf('KernelPCA initialized with invalid kernel: %d', $this->kernel));
}
}
protected function getDistancePairs(array $sample): array
{
$kernel = $this->getKernel();
$pairs = [];
foreach ($this->data as $row) {
$pairs[] = $kernel($row, $sample);
}
return $pairs;
}
protected function projectSample(array $pairs): array
{
// Normalize eigenvectors by eig = eigVectors / eigValues
$func = function ($eigVal, $eigVect) {
$m = new Matrix($eigVect, false);
$a = $m->divideByScalar($eigVal)->toArray();
return $a[0];
};
$eig = array_map($func, $this->eigValues, $this->eigVectors);
// return k.dot(eig)
return Matrix::dot($pairs, $eig);
}
}