2017-04-23 07:03:30 +00:00
|
|
|
<?php
|
|
|
|
|
|
|
|
declare(strict_types=1);
|
|
|
|
|
|
|
|
namespace Phpml\Math\Statistic;
|
|
|
|
|
|
|
|
use Phpml\Exception\InvalidArgumentException;
|
|
|
|
|
|
|
|
class Covariance
{
    /**
     * Calculates covariance from two given arrays, x and y, respectively.
     *
     * Values are paired by the keys of $x, i.e. each $x[$key] is combined with $y[$key].
     *
     * @param array      $x      Observations of the first variable
     * @param array      $y      Observations of the second variable (same size as $x)
     * @param bool       $sample When true the sum is divided by (n - 1), otherwise by n
     * @param float|null $meanX  Precomputed mean of $x; obtained from Mean::arithmetic() when null
     * @param float|null $meanY  Precomputed mean of $y; obtained from Mean::arithmetic() when null
     *
     * @return float Sum of the products of deviations divided by n (or n - 1 for a sample)
     *
     * @throws InvalidArgumentException When an array is empty, the sizes differ,
     *                                  or a sample covariance is requested for a single element
     */
    public static function fromXYArrays(array $x, array $y, bool $sample = true, float $meanX = null, float $meanY = null) : float
    {
        if (empty($x) || empty($y)) {
            throw InvalidArgumentException::arrayCantBeEmpty();
        }

        $n = count($x);

        // Mismatched sizes would either read undefined offsets of $y or silently
        // ignore part of it while its mean is still taken over every element.
        if ($n !== count($y)) {
            throw new InvalidArgumentException('Size of given arrays does not match');
        }

        if ($sample && $n === 1) {
            throw InvalidArgumentException::arraySizeToSmall(2);
        }

        if ($meanX === null) {
            $meanX = Mean::arithmetic($x);
        }

        if ($meanY === null) {
            $meanY = Mean::arithmetic($y);
        }

        $sum = 0.0;
        foreach ($x as $index => $xi) {
            $yi = $y[$index];
            $sum += ($xi - $meanX) * ($yi - $meanY);
        }

        if ($sample) {
            // Sample covariance divides by (n - 1) instead of n.
            --$n;
        }

        return $sum / $n;
    }

    /**
     * Calculates covariance of two dimensions, i and k in the given data.
     *
     * @param array      $data   Rows of observations; every row is an array of column values
     * @param int        $i      Index of the first column
     * @param int        $k      Index of the second column
     * @param bool       $sample When true the sum is divided by (n - 1), otherwise by n
     * @param float|null $meanX  Precomputed mean of column $i; computed from the data when null
     * @param float|null $meanY  Precomputed mean of column $k; computed from the data when null
     *
     * @return float Covariance of columns $i and $k
     *
     * @throws InvalidArgumentException When $data is empty or a sample covariance
     *                                  is requested for a single row
     * @throws \Exception               When $i or $k lies outside the columns of the first row
     */
    public static function fromDataset(array $data, int $i, int $k, bool $sample = true, float $meanX = null, float $meanY = null) : float
    {
        if (empty($data)) {
            throw InvalidArgumentException::arrayCantBeEmpty();
        }

        $n = count($data);
        if ($sample && $n === 1) {
            throw InvalidArgumentException::arraySizeToSmall(2);
        }

        // $i and $k address columns, so they are validated against the width of
        // a row (taken from the first row), not against the number of rows.
        $firstRow = reset($data);
        $dimensions = is_array($firstRow) ? count($firstRow) : 0;
        if ($i < 0 || $k < 0 || $i >= $dimensions || $k >= $dimensions) {
            throw new \Exception('Given indices i and k do not match with the dimensionality of data');
        }

        // Only a missing mean is computed; a mean supplied by the caller is kept as is.
        // The column copy made by array_column is needed solely for that computation.
        if ($meanX === null) {
            $meanX = Mean::arithmetic(array_column($data, $i));
        }

        if ($meanY === null) {
            $meanY = Mean::arithmetic(array_column($data, $k));
        }

        // Traverse the rows directly and pick the two needed values by index,
        // which avoids operating on extra copies of the column data.
        $sum = 0.0;
        foreach ($data as $row) {
            $sum += ($row[$i] - $meanX) * ($row[$k] - $meanY);
        }

        if ($sample) {
            // Sample covariance divides by (n - 1) instead of n.
            --$n;
        }

        return $sum / $n;
    }

    /**
     * Returns the covariance matrix of n-dimensional data.
     *
     * Every entry is obtained from fromDataset() with $sample = true. Only the
     * entries with i <= k are computed; the rest are mirrored from them.
     *
     * @param array      $data  Rows of observations; the column count is taken from the first row
     * @param array|null $means Precomputed mean of every column, indexed by column;
     *                          computed from the data when null
     *
     * @return array Square matrix where element [i][k] is the covariance of columns i and k
     *
     * @throws InvalidArgumentException When $data is empty
     * @throws \Exception
     */
    public static function covarianceMatrix(array $data, array $means = null) : array
    {
        if (empty($data)) {
            throw InvalidArgumentException::arrayCantBeEmpty();
        }

        $firstRow = reset($data);
        $n = count($firstRow);

        if ($means === null) {
            $means = [];
            for ($i = 0; $i < $n; ++$i) {
                $means[] = Mean::arithmetic(array_column($data, $i));
            }
        }

        $cov = [];
        for ($i = 0; $i < $n; ++$i) {
            for ($k = 0; $k < $n; ++$k) {
                if ($i > $k) {
                    // Row $k was filled in an earlier iteration, so reuse the mirrored entry.
                    $cov[$i][$k] = $cov[$k][$i];
                } else {
                    $cov[$i][$k] = self::fromDataset(
                        $data,
                        $i,
                        $k,
                        true,
                        $means[$i],
                        $means[$k]
                    );
                }
            }
        }

        return $cov;
    }
}
|