diff --git a/src/Phpml/Classification/DecisionTree.php b/src/Phpml/Classification/DecisionTree.php
index b2b4db3..0a70d2f 100644
--- a/src/Phpml/Classification/DecisionTree.php
+++ b/src/Phpml/Classification/DecisionTree.php
@@ -16,11 +16,6 @@ class DecisionTree implements Classifier
const CONTINUOS = 1;
const NOMINAL = 2;
- /**
- * @var array
- */
- private $samples = [];
-
/**
* @var array
*/
diff --git a/src/Phpml/Classification/Linear/DecisionStump.php b/src/Phpml/Classification/Linear/DecisionStump.php
index 1605a20..de86fe9 100644
--- a/src/Phpml/Classification/Linear/DecisionStump.php
+++ b/src/Phpml/Classification/Linear/DecisionStump.php
@@ -5,13 +5,13 @@ declare(strict_types=1);
namespace Phpml\Classification\Linear;
use Phpml\Helper\Predictable;
-use Phpml\Helper\Trainable;
+use Phpml\Helper\OneVsRest;
use Phpml\Classification\WeightedClassifier;
use Phpml\Classification\DecisionTree;
class DecisionStump extends WeightedClassifier
{
- use Trainable, Predictable;
+ use Predictable, OneVsRest;
const AUTO_SELECT = -1;
@@ -20,6 +20,10 @@ class DecisionStump extends WeightedClassifier
*/
protected $givenColumnIndex;
+ /**
+ * @var array
+ */
+ protected $binaryLabels;
/**
* Sample weights : If used the optimization on the decision value
@@ -57,10 +61,22 @@ class DecisionStump extends WeightedClassifier
*/
protected $columnTypes;
+ /**
+ * @var int
+ */
+ protected $featureCount;
+
/**
* @var float
*/
- protected $numSplitCount = 10.0;
+ protected $numSplitCount = 100.0;
+
+ /**
+ * Distribution of samples in the leaves
+ *
+ * @var array
+ */
+ protected $prob;
/**
* A DecisionStump classifier is a one-level deep DecisionTree. It is generally
@@ -81,20 +97,15 @@ class DecisionStump extends WeightedClassifier
* @param array $samples
* @param array $targets
*/
- public function train(array $samples, array $targets)
+ protected function trainBinary(array $samples, array $targets)
{
$this->samples = array_merge($this->samples, $samples);
$this->targets = array_merge($this->targets, $targets);
-
- // DecisionStump is capable of classifying between two classes only
- $labels = array_count_values($this->targets);
- $this->labels = array_keys($labels);
- if (count($this->labels) != 2) {
- throw new \Exception("DecisionStump can classify between two classes only:" . implode(',', $this->labels));
- }
+ $this->binaryLabels = array_keys(array_count_values($this->targets));
+ $this->featureCount = count($this->samples[0]);
// If a column index is given, it should be among the existing columns
- if ($this->givenColumnIndex > count($samples[0]) - 1) {
+ if ($this->givenColumnIndex > count($this->samples[0]) - 1) {
$this->givenColumnIndex = self::AUTO_SELECT;
}
@@ -106,7 +117,7 @@ class DecisionStump extends WeightedClassifier
throw new \Exception("Number of sample weights does not match with number of samples");
}
} else {
- $this->weights = array_fill(0, count($samples), 1);
+ $this->weights = array_fill(0, count($this->samples), 1);
}
// Determine type of each column as either "continuous" or "nominal"
@@ -114,14 +125,15 @@ class DecisionStump extends WeightedClassifier
// Try to find the best split in the columns of the dataset
// by calculating error rate for each split point in each column
- $columns = range(0, count($samples[0]) - 1);
+ $columns = range(0, count($this->samples[0]) - 1);
if ($this->givenColumnIndex != self::AUTO_SELECT) {
$columns = [$this->givenColumnIndex];
}
$bestSplit = [
'value' => 0, 'operator' => '',
- 'column' => 0, 'trainingErrorRate' => 1.0];
+ 'prob' => [], 'column' => 0,
+ 'trainingErrorRate' => 1.0];
foreach ($columns as $col) {
if ($this->columnTypes[$col] == DecisionTree::CONTINUOS) {
$split = $this->getBestNumericalSplit($col);
@@ -164,6 +176,10 @@ class DecisionStump extends WeightedClassifier
protected function getBestNumericalSplit(int $col)
{
$values = array_column($this->samples, $col);
+ // Trying all possible points may be accomplished in two general ways:
+ // 1- Try all values in the $samples array ($values)
+ // 2- Artificially split the range of values into several parts and try them
+ // We choose the second one because it is faster in larger datasets
$minValue = min($values);
$maxValue = max($values);
$stepSize = ($maxValue - $minValue) / $this->numSplitCount;
@@ -174,19 +190,21 @@ class DecisionStump extends WeightedClassifier
// Before trying all possible split points, let's first try
// the average value for the cut point
$threshold = array_sum($values) / (float) count($values);
- $errorRate = $this->calculateErrorRate($threshold, $operator, $values);
+ list($errorRate, $prob) = $this->calculateErrorRate($threshold, $operator, $values);
if ($split == null || $errorRate < $split['trainingErrorRate']) {
$split = ['value' => $threshold, 'operator' => $operator,
- 'column' => $col, 'trainingErrorRate' => $errorRate];
+ 'prob' => $prob, 'column' => $col,
+ 'trainingErrorRate' => $errorRate];
}
// Try other possible points one by one
for ($step = $minValue; $step <= $maxValue; $step+= $stepSize) {
$threshold = (float)$step;
- $errorRate = $this->calculateErrorRate($threshold, $operator, $values);
+ list($errorRate, $prob) = $this->calculateErrorRate($threshold, $operator, $values);
if ($errorRate < $split['trainingErrorRate']) {
$split = ['value' => $threshold, 'operator' => $operator,
- 'column' => $col, 'trainingErrorRate' => $errorRate];
+ 'prob' => $prob, 'column' => $col,
+ 'trainingErrorRate' => $errorRate];
}
}// for
}
@@ -210,11 +228,12 @@ class DecisionStump extends WeightedClassifier
foreach (['=', '!='] as $operator) {
foreach ($distinctVals as $val) {
- $errorRate = $this->calculateErrorRate($val, $operator, $values);
+ list($errorRate, $prob) = $this->calculateErrorRate($val, $operator, $values);
if ($split == null || $split['trainingErrorRate'] < $errorRate) {
$split = ['value' => $val, 'operator' => $operator,
- 'column' => $col, 'trainingErrorRate' => $errorRate];
+ 'prob' => $prob, 'column' => $col,
+ 'trainingErrorRate' => $errorRate];
}
}// for
}
@@ -238,9 +257,9 @@ class DecisionStump extends WeightedClassifier
case '>=': return $leftValue >= $rightValue;
case '<': return $leftValue < $rightValue;
case '<=': return $leftValue <= $rightValue;
- case '=': return $leftValue == $rightValue;
+ case '=': return $leftValue === $rightValue;
case '!=':
- case '<>': return $leftValue != $rightValue;
+ case '<>': return $leftValue !== $rightValue;
}
return false;
@@ -253,42 +272,90 @@ class DecisionStump extends WeightedClassifier
* @param float $threshold
* @param string $operator
* @param array $values
+ *
+ * @return array
*/
protected function calculateErrorRate(float $threshold, string $operator, array $values)
{
- $total = (float) array_sum($this->weights);
$wrong = 0.0;
- $leftLabel = $this->labels[0];
- $rightLabel= $this->labels[1];
+ $prob = [];
+ $leftLabel = $this->binaryLabels[0];
+ $rightLabel= $this->binaryLabels[1];
+
foreach ($values as $index => $value) {
- if ($this->evaluate($threshold, $operator, $value)) {
+ if ($this->evaluate($value, $operator, $threshold)) {
$predicted = $leftLabel;
} else {
$predicted = $rightLabel;
}
- if ($predicted != $this->targets[$index]) {
+ $target = $this->targets[$index];
+ if (strval($predicted) != strval($this->targets[$index])) {
$wrong += $this->weights[$index];
}
+
+ if (! isset($prob[$predicted][$target])) {
+ $prob[$predicted][$target] = 0;
+ }
+ $prob[$predicted][$target]++;
+ }
+
+ // Calculate probabilities: Proportion of labels in each leaf
+ $dist = array_combine($this->binaryLabels, array_fill(0, 2, 0.0));
+ foreach ($prob as $leaf => $counts) {
+ $leafTotal = (float)array_sum($prob[$leaf]);
+ foreach ($counts as $label => $count) {
+ if (strval($leaf) == strval($label)) {
+ $dist[$leaf] = $count / $leafTotal;
+ }
+ }
}
- return $wrong / $total;
+ return [$wrong / (float) array_sum($this->weights), $dist];
+ }
+
+ /**
+ * Returns the probability of the sample belonging to the given label
+ *
+ * Probability of a sample is calculated as the proportion of the label
+ * within the labels of the training samples in the decision node
+ *
+ * @param array $sample
+ * @param mixed $label
+ *
+ * @return float
+ */
+ protected function predictProbability(array $sample, $label)
+ {
+ $predicted = $this->predictSampleBinary($sample);
+ if (strval($predicted) == strval($label)) {
+ return $this->prob[$label];
+ }
+
+ return 0.0;
}
/**
* @param array $sample
+ *
* @return mixed
*/
- protected function predictSample(array $sample)
+ protected function predictSampleBinary(array $sample)
{
- if ($this->evaluate($this->value, $this->operator, $sample[$this->column])) {
- return $this->labels[0];
+ if ($this->evaluate($sample[$this->column], $this->operator, $this->value)) {
+ return $this->binaryLabels[0];
}
- return $this->labels[1];
+
+ return $this->binaryLabels[1];
}
+ /**
+ * @return string
+ */
public function __toString()
{
- return "$this->column $this->operator $this->value";
+ return "IF $this->column $this->operator $this->value " .
+ "THEN " . $this->binaryLabels[0] . " ".
+ "ELSE " . $this->binaryLabels[1];
}
}
diff --git a/src/Phpml/Classification/Linear/Perceptron.php b/src/Phpml/Classification/Linear/Perceptron.php
index e2c684d..32e41f2 100644
--- a/src/Phpml/Classification/Linear/Perceptron.php
+++ b/src/Phpml/Classification/Linear/Perceptron.php
@@ -5,12 +5,13 @@ declare(strict_types=1);
namespace Phpml\Classification\Linear;
use Phpml\Helper\Predictable;
+use Phpml\Helper\OneVsRest;
use Phpml\Classification\Classifier;
use Phpml\Preprocessing\Normalizer;
class Perceptron implements Classifier
{
- use Predictable;
+ use Predictable, OneVsRest;
/**
* The function whose result will be used to calculate the network error
@@ -114,7 +115,7 @@ class Perceptron implements Classifier
* @param array $samples
* @param array $targets
*/
- public function train(array $samples, array $targets)
+ public function trainBinary(array $samples, array $targets)
{
$this->labels = array_keys(array_count_values($targets));
if (count($this->labels) > 2) {
@@ -128,7 +129,7 @@ class Perceptron implements Classifier
// Set all target values to either -1 or 1
$this->labels = [1 => $this->labels[0], -1 => $this->labels[1]];
foreach ($targets as $target) {
- $this->targets[] = $target == $this->labels[1] ? 1 : -1;
+ $this->targets[] = strval($target) == strval($this->labels[1]) ? 1 : -1;
}
// Set samples and feature count vars
@@ -213,6 +214,25 @@ class Perceptron implements Classifier
return false;
}
+ /**
+ * Checks if the sample should be normalized and if so, returns the
+ * normalized sample
+ *
+ * @param array $sample
+ *
+ * @return array
+ */
+ protected function checkNormalizedSample(array $sample)
+ {
+ if ($this->normalizer) {
+ $samples = [$sample];
+ $this->normalizer->transform($samples);
+ $sample = $samples[0];
+ }
+
+ return $sample;
+ }
+
/**
* Calculates net output of the network as a float value for the given input
*
@@ -244,17 +264,34 @@ class Perceptron implements Classifier
return $this->output($sample) > 0 ? 1 : -1;
}
+ /**
+ * Returns the probability of the sample belonging to the given label.
+ *
+ * The probability is simply taken as the distance of the sample to the decision plane.
+ *
+ * @param array $sample
+ * @param mixed $label
+ * @return float
+ */
+ protected function predictProbability(array $sample, $label)
+ {
+ $predicted = $this->predictSampleBinary($sample);
+
+ if (strval($predicted) == strval($label)) {
+ $sample = $this->checkNormalizedSample($sample);
+ return abs($this->output($sample));
+ }
+
+ return 0.0;
+ }
+
/**
* @param array $sample
* @return mixed
*/
- protected function predictSample(array $sample)
+ protected function predictSampleBinary(array $sample)
{
- if ($this->normalizer) {
- $samples = [$sample];
- $this->normalizer->transform($samples);
- $sample = $samples[0];
- }
+ $sample = $this->checkNormalizedSample($sample);
$predictedClass = $this->outputClass($sample);
diff --git a/src/Phpml/Helper/OneVsRest.php b/src/Phpml/Helper/OneVsRest.php
new file mode 100644
index 0000000..1288363
--- /dev/null
+++ b/src/Phpml/Helper/OneVsRest.php
@@ -0,0 +1,126 @@
+classifiers = [];
+
+ // If there are only two targets, then there is no need to perform OvR
+ $this->labels = array_keys(array_count_values($targets));
+ if (count($this->labels) == 2) {
+ $classifier->trainBinary($samples, $targets);
+ $this->classifiers[] = $classifier;
+ } else {
+ // Train a separate classifier for each label and memorize them
+ $this->samples = $samples;
+ $this->targets = $targets;
+ foreach ($this->labels as $label) {
+ $predictor = clone $classifier;
+ $targets = $this->binarizeTargets($label);
+ $predictor->trainBinary($samples, $targets);
+ $this->classifiers[$label] = $predictor;
+ }
+ }
+ }
+
+ /**
+ * Groups all targets into two groups: Targets equal to
+ * the given label and the others
+ *
+ * @param mixed $label
+ */
+ private function binarizeTargets($label)
+ {
+ $targets = [];
+
+ foreach ($this->targets as $target) {
+ $targets[] = $target == $label ? $label : "not_$label";
+ }
+
+ return $targets;
+ }
+
+
+ /**
+ * @param array $sample
+ *
+ * @return mixed
+ */
+ protected function predictSample(array $sample)
+ {
+ if (count($this->labels) == 2) {
+ return $this->classifiers[0]->predictSampleBinary($sample);
+ }
+
+ $probs = [];
+
+ foreach ($this->classifiers as $label => $predictor) {
+ $probs[$label] = $predictor->predictProbability($sample, $label);
+ }
+
+ arsort($probs, SORT_NUMERIC);
+ return key($probs);
+ }
+
+ /**
+ * Each classifier should implement this method instead of train(samples, targets)
+ *
+ * @param array $samples
+ * @param array $targets
+ */
+ abstract protected function trainBinary(array $samples, array $targets);
+
+ /**
+ * Each classifier that makes use of the OvR approach should be able
+ * to return a probability for a sample to belong to the given label.
+ *
+ * @param array $sample
+ * @param string $label
+ * @return mixed
+ */
+ abstract protected function predictProbability(array $sample, string $label);
+
+ /**
+ * Each classifier should implement this method instead of predictSample()
+ *
+ * @param array $sample
+ *
+ * @return mixed
+ */
+ abstract protected function predictSampleBinary(array $sample);
+}
diff --git a/src/Phpml/Math/Statistic/Gaussian.php b/src/Phpml/Math/Statistic/Gaussian.php
new file mode 100644
index 0000000..df27f07
--- /dev/null
+++ b/src/Phpml/Math/Statistic/Gaussian.php
@@ -0,0 +1,60 @@
+mean = $mean;
+ $this->std = $std;
+ }
+
+ /**
+ * Returns probability density of the given $value
+ *
+ * @param float $value
+ *
+ * @return float
+ */
+ public function pdf(float $value)
+ {
+ // Calculate the probability density by use of normal/Gaussian distribution
+ // Ref: https://en.wikipedia.org/wiki/Normal_distribution
+ $std2 = $this->std ** 2;
+ $mean = $this->mean;
+ return exp(- (($value - $mean) ** 2) / (2 * $std2)) / sqrt(2 * $std2 * pi());
+ }
+
+ /**
+ * Returns probability density value of the given $value based on
+ * given standard deviation and the mean
+ *
+ * @param float $mean
+ * @param float $std
+ * @param float $value
+ *
+ * @return float
+ */
+ public static function distributionPdf(float $mean, float $std, float $value)
+ {
+ $normal = new self($mean, $std);
+ return $normal->pdf($value);
+ }
+}
diff --git a/tests/Phpml/Classification/Linear/AdalineTest.php b/tests/Phpml/Classification/Linear/AdalineTest.php
index 7ea63ab..c07fbd2 100644
--- a/tests/Phpml/Classification/Linear/AdalineTest.php
+++ b/tests/Phpml/Classification/Linear/AdalineTest.php
@@ -30,6 +30,21 @@ class AdalineTest extends TestCase
$this->assertEquals(1, $classifier->predict([0.1, 0.99]));
$this->assertEquals(1, $classifier->predict([1.1, 0.8]));
+ // By use of One-v-Rest, Adaline can perform multi-class classification
+ // The samples should be separable by lines perpendicular to the dimensions
+ $samples = [
+ [0, 0], [0, 1], [1, 0], [1, 1], // First group : a cluster at bottom-left corner in 2D
+ [5, 5], [6, 5], [5, 6], [7, 5], // Second group: another cluster at the middle-right
+ [3, 10],[3, 10],[3, 8], [3, 9] // Third group : cluster at the top-middle
+ ];
+ $targets = [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2];
+
+ $classifier = new Adaline();
+ $classifier->train($samples, $targets);
+ $this->assertEquals(0, $classifier->predict([0.5, 0.5]));
+ $this->assertEquals(1, $classifier->predict([6.0, 5.0]));
+ $this->assertEquals(2, $classifier->predict([3.0, 9.5]));
+
return $classifier;
}
diff --git a/tests/Phpml/Classification/Linear/DecisionStumpTest.php b/tests/Phpml/Classification/Linear/DecisionStumpTest.php
index f83e095..4060ce3 100644
--- a/tests/Phpml/Classification/Linear/DecisionStumpTest.php
+++ b/tests/Phpml/Classification/Linear/DecisionStumpTest.php
@@ -12,8 +12,9 @@ class DecisionStumpTest extends TestCase
{
public function testPredictSingleSample()
{
- // Samples should be separable with a line perpendicular to any dimension
- // given in the dataset
+ // Samples should be separable with a line perpendicular
+ // to any dimension given in the dataset
+ //
// First: horizontal test
$samples = [[0, 0], [1, 0], [0, 1], [1, 1]];
$targets = [0, 0, 1, 1];
@@ -34,6 +35,21 @@ class DecisionStumpTest extends TestCase
$this->assertEquals(1, $classifier->predict([1.0, 0.99]));
$this->assertEquals(1, $classifier->predict([1.1, 0.1]));
+ // By use of One-v-Rest, DecisionStump can perform multi-class classification
+ // The samples should be separable by lines perpendicular to the dimensions
+ $samples = [
+ [0, 0], [0, 1], [1, 0], [1, 1], // First group : a cluster at bottom-left corner in 2D
+ [5, 5], [6, 5], [5, 6], [7, 5], // Second group: another cluster at the middle-right
+ [3, 10],[3, 10],[3, 8], [3, 9] // Third group : cluster at the top-middle
+ ];
+ $targets = [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2];
+
+ $classifier = new DecisionStump();
+ $classifier->train($samples, $targets);
+ $this->assertEquals(0, $classifier->predict([0.5, 0.5]));
+ $this->assertEquals(1, $classifier->predict([6.0, 5.0]));
+ $this->assertEquals(2, $classifier->predict([3.5, 9.5]));
+
return $classifier;
}
diff --git a/tests/Phpml/Classification/Linear/PerceptronTest.php b/tests/Phpml/Classification/Linear/PerceptronTest.php
index 64954f7..ef820fc 100644
--- a/tests/Phpml/Classification/Linear/PerceptronTest.php
+++ b/tests/Phpml/Classification/Linear/PerceptronTest.php
@@ -30,6 +30,21 @@ class PerceptronTest extends TestCase
$this->assertEquals(1, $classifier->predict([0.1, 0.99]));
$this->assertEquals(1, $classifier->predict([1.1, 0.8]));
+ // By use of One-v-Rest, Perceptron can perform multi-class classification
+ // The samples should be separable by lines perpendicular to the dimensions
+ $samples = [
+ [0, 0], [0, 1], [1, 0], [1, 1], // First group : a cluster at bottom-left corner in 2D
+ [5, 5], [6, 5], [5, 6], [7, 5], // Second group: another cluster at the middle-right
+ [3, 10],[3, 10],[3, 8], [3, 9] // Third group : cluster at the top-middle
+ ];
+ $targets = [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2];
+
+ $classifier = new Perceptron();
+ $classifier->train($samples, $targets);
+ $this->assertEquals(0, $classifier->predict([0.5, 0.5]));
+ $this->assertEquals(1, $classifier->predict([6.0, 5.0]));
+ $this->assertEquals(2, $classifier->predict([3.0, 9.5]));
+
return $classifier;
}
diff --git a/tests/Phpml/Math/Statistic/GaussianTest.php b/tests/Phpml/Math/Statistic/GaussianTest.php
new file mode 100644
index 0000000..6bbf63b
--- /dev/null
+++ b/tests/Phpml/Math/Statistic/GaussianTest.php
@@ -0,0 +1,28 @@
+ $v) {
+ $this->assertEquals($pdf[$i], $g->pdf($v), '', $delta);
+
+ $this->assertEquals($pdf[$i], Gaussian::distributionPdf($mean, $std, $v), '', $delta);
+ }
+ }
+}