Ensemble Classifiers : Bagging and RandomForest (#36)

* Fuzzy C-Means implementation

* Update FuzzyCMeans

* Rename FuzzyCMeans to FuzzyCMeans.php

* Update NaiveBayes.php

* Small fix applied to improve training performance

array_unique is replaced with array_count_values+array_keys which is way
faster

* Revert "Small fix applied to improve training performance"

This reverts commit c20253f16ac3e8c37d33ecaee28a87cc767e3b7f.

* Revert "Revert "Small fix applied to improve training performance""

This reverts commit ea10e136c4c11b71609ccdcaf9999067e4be473e.

* Revert "Small fix applied to improve training performance"

This reverts commit c20253f16ac3e8c37d33ecaee28a87cc767e3b7f.

* First DecisionTree implementation

* Revert "First DecisionTree implementation"

This reverts commit 4057a08679c26010c39040a48a3e6dad994a1a99.

* DecisionTree

* FCM Test

* FCM Test

* DecisionTree Test

* Ensemble classifiers: Bagging and RandomForests

* test

* Fixes for conflicted files

* Bagging and RandomForest ensemble algorithms

* Changed unit test

* Changed unit test

* Changed unit test

* Bagging and RandomForest ensemble algorithms

* Bagging and RandomForest ensemble algorithms

* Bagging and RandomForest ensemble algorithms

RandomForest algorithm is improved with changes to original DecisionTree

* Bagging and RandomForest ensemble algorithms

* Slight fix about use of global Exception class

* Fixed the error about wrong use of global Exception class

* RandomForest code formatting
This commit is contained in:
Mustafa Karabulut 2017-02-07 13:37:56 +02:00 committed by Arkadiusz Kondas
parent 72b25ffd42
commit 1d73503958
6 changed files with 507 additions and 7 deletions

View File

@ -51,6 +51,11 @@ class DecisionTree implements Classifier
*/ */
public $actualDepth = 0; public $actualDepth = 0;
/**
* @var int
*/
private $numUsableFeatures = 0;
/** /**
* @param int $maxDepth * @param int $maxDepth
*/ */
@ -144,15 +149,15 @@ class DecisionTree implements Classifier
$samples = array_combine($records, $this->preprocess($samples)); $samples = array_combine($records, $this->preprocess($samples));
$bestGiniVal = 1; $bestGiniVal = 1;
$bestSplit = null; $bestSplit = null;
for ($i=0; $i<$this->featureCount; $i++) { $features = $this->getSelectedFeatures();
foreach ($features as $i) {
$colValues = []; $colValues = [];
$baseValue = null;
foreach ($samples as $index => $row) { foreach ($samples as $index => $row) {
$colValues[$index] = $row[$i]; $colValues[$index] = $row[$i];
if ($baseValue === null) {
$baseValue = $row[$i];
}
} }
$counts = array_count_values($colValues);
arsort($counts);
$baseValue = key($counts);
$gini = $this->getGiniIndex($baseValue, $colValues, $targets); $gini = $this->getGiniIndex($baseValue, $colValues, $targets);
if ($bestSplit == null || $bestGiniVal > $gini) { if ($bestSplit == null || $bestGiniVal > $gini) {
$split = new DecisionTreeLeaf(); $split = new DecisionTreeLeaf();
@ -167,6 +172,27 @@ class DecisionTree implements Classifier
return $bestSplit; return $bestSplit;
} }
/**
 * Returns the indices of the feature columns to inspect while searching
 * for the best split at the current node.
 *
 * If $numUsableFeatures is 0 (the default), every feature is considered;
 * otherwise a random subset of at most $numUsableFeatures columns is
 * drawn (RandomForest-style per-split column sampling). The selected
 * indices are returned in ascending order.
 *
 * @return array
 */
protected function getSelectedFeatures()
{
    $allFeatures = range(0, $this->featureCount - 1);

    // Strict comparison: 0 disables column sub-sampling entirely.
    if ($this->numUsableFeatures === 0) {
        return $allFeatures;
    }

    // Clamp the requested count to the number of available columns.
    $numFeatures = min($this->numUsableFeatures, $this->featureCount);

    shuffle($allFeatures);
    $selectedFeatures = array_slice($allFeatures, 0, $numFeatures);
    sort($selectedFeatures);

    return $selectedFeatures;
}
/** /**
* @param string $baseValue * @param string $baseValue
* @param array $colValues * @param array $colValues
@ -248,6 +274,27 @@ class DecisionTree implements Classifier
return false; return false;
} }
/**
 * Sets the number of columns that may be examined when deciding a split
 * at an internal node of the tree. <br>
 * A value of 0 means all features are used (default behaviour);
 * any positive value is used as an upper bound on the number of columns
 * randomly selected for each split operation.
 *
 * @param int $numFeatures  0 to use all features, or a positive maximum
 * @return $this            fluent interface
 * @throws Exception        when a negative value is given
 */
public function setNumFeatures(int $numFeatures)
{
if ($numFeatures < 0) {
throw new \Exception("Selected column count should be greater or equal to zero");
}
$this->numUsableFeatures = $numFeatures;
return $this;
}
/** /**
* @return string * @return string
*/ */
@ -273,6 +320,7 @@ class DecisionTree implements Classifier
$node = $node->rightLeaf; $node = $node->rightLeaf;
} }
} while ($node); } while ($node);
return $node->classValue;
return $node ? $node->classValue : $this->labels[0];
} }
} }

View File

@ -62,7 +62,7 @@ class DecisionTreeLeaf
public function evaluate($record) public function evaluate($record)
{ {
$recordField = $record[$this->columnIndex]; $recordField = $record[$this->columnIndex];
if (preg_match("/^([<>=]{1,2})\s*(.*)/", $this->value, $matches)) { if (is_string($this->value) && preg_match("/^([<>=]{1,2})\s*(.*)/", $this->value, $matches)) {
$op = $matches[1]; $op = $matches[1];
$value= floatval($matches[2]); $value= floatval($matches[2]);
$recordField = strval($recordField); $recordField = strval($recordField);

View File

@ -0,0 +1,198 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification\Ensemble;
use Phpml\Helper\Predictable;
use Phpml\Helper\Trainable;
use Phpml\Math\Statistic\Mean;
use Phpml\Classification\Classifier;
use Phpml\Classification\DecisionTree;
use Phpml\Classification\NaiveBayes;
class Bagging implements Classifier
{
    use Trainable, Predictable;

    /**
     * Total number of training samples accumulated so far.
     *
     * @var int
     */
    protected $numSamples;

    /**
     * Target labels aligned with $samples by index.
     *
     * @var array
     */
    private $targets = [];

    /**
     * Number of columns in a training sample (taken from the last train() call).
     *
     * @var int
     */
    protected $featureCount = 0;

    /**
     * Number of base classifiers in the ensemble.
     *
     * @var int
     */
    protected $numClassifier;

    /**
     * Fully-qualified class name of the base classifier; any class
     * implementing Classifier can be used.
     *
     * @var Classifier
     */
    protected $classifier = DecisionTree::class;

    /**
     * Constructor arguments passed positionally to each base classifier.
     *
     * @var array
     */
    protected $classifierOptions = ['depth' => 20];

    /**
     * Trained base classifier instances.
     *
     * @var array
     */
    protected $classifiers;

    /**
     * Ratio of the dataset used to build each bootstrap subset.
     *
     * @var float
     */
    protected $subsetRatio = 0.5;

    /**
     * Accumulated training samples (train() appends to previous calls).
     *
     * @var array
     */
    private $samples = [];

    /**
     * Creates an ensemble classifier with the given number of base classifiers.<br>
     * Default number of base classifiers is 50.
     * The more base classifiers, the better the performance — but at the cost
     * of processing time.
     *
     * @param int $numClassifier
     */
    public function __construct($numClassifier = 50)
    {
        $this->numClassifier = $numClassifier;
    }

    /**
     * This method determines the ratio of samples used to create the 'bootstrap' subset,
     * e.g., random samples drawn from the original dataset with replacement (allow repeats),
     * to train each base classifier.
     *
     * @param float $ratio  value in the closed range [0.1, 1.0]
     * @return $this
     * @throws Exception when the ratio falls outside [0.1, 1.0]
     */
    public function setSubsetRatio(float $ratio)
    {
        if ($ratio < 0.1 || $ratio > 1.0) {
            throw new \Exception("Subset ratio should be between 0.1 and 1.0");
        }

        $this->subsetRatio = $ratio;

        return $this;
    }

    /**
     * This method is used to set the base classifier. Default value is
     * DecisionTree::class, but any class that implements the <i>Classifier</i>
     * can be used. <br>
     * While giving the parameters of the classifier, the values should be
     * given in the order they are in the constructor of the classifier and parameter
     * names are neglected.
     *
     * NOTE(review): the method name keeps its historical spelling ("Classifer")
     * for backward compatibility with existing callers.
     *
     * @param string $classifier
     * @param array $classifierOptions
     * @return $this
     */
    public function setClassifer(string $classifier, array $classifierOptions = [])
    {
        $this->classifier = $classifier;
        $this->classifierOptions = $classifierOptions;

        return $this;
    }

    /**
     * Trains the ensemble. Samples are accumulated across successive calls;
     * each base classifier is (re)trained on its own random subset.
     *
     * @param array $samples
     * @param array $targets
     */
    public function train(array $samples, array $targets)
    {
        $this->samples = array_merge($this->samples, $samples);
        $this->targets = array_merge($this->targets, $targets);
        $this->featureCount = count($samples[0]);
        $this->numSamples = count($this->samples);

        // Init classifiers and train them with random sub-samples
        $this->classifiers = $this->initClassifiers();
        $index = 0;
        foreach ($this->classifiers as $classifier) {
            list($samples, $targets) = $this->getRandomSubset($index);
            $classifier->train($samples, $targets);
            ++$index;
        }
    }

    /**
     * Draws a random subset of the accumulated training data for the
     * classifier with the given index (sampling with replacement).
     *
     * @param int $index  zero-based index of the base classifier
     * @return array      [$samples, $targets]
     */
    protected function getRandomSubset($index)
    {
        // Half the target subset size; the loop below draws 2x this many
        // samples, so the effective subset size is numSamples * subsetRatio.
        // (Cast to int so the modulo below and the loop bound are integral;
        // float operands to % are deprecated in modern PHP.)
        $subsetLength = (int)($this->numSamples * $this->subsetRatio / 2);

        // NOTE(review): sampling starts at a sliding per-classifier offset
        // instead of 0, so later rows are over-represented for higher
        // indices — looks intentional for diversity, but confirm.
        $index = $index * $subsetLength % $this->numSamples;

        $samples = [];
        $targets = [];
        for ($i = 0; $i < $subsetLength * 2; ++$i) {
            $rand = rand($index, $this->numSamples - 1);
            $samples[] = $this->samples[$rand];
            $targets[] = $this->targets[$rand];
        }

        return [$samples, $targets];
    }

    /**
     * Instantiates the base classifiers via reflection, passing the
     * configured constructor options positionally.
     *
     * @return array
     */
    protected function initClassifiers()
    {
        $classifiers = [];
        for ($i = 0; $i < $this->numClassifier; ++$i) {
            $ref = new \ReflectionClass($this->classifier);
            if ($this->classifierOptions) {
                $obj = $ref->newInstanceArgs($this->classifierOptions);
            } else {
                $obj = $ref->newInstance();
            }

            $classifiers[] = $this->initSingleClassifier($obj, $i);
        }

        return $classifiers;
    }

    /**
     * Hook for subclasses to customise a freshly created base classifier
     * (e.g. RandomForest sets the per-split feature count here).
     *
     * @param Classifier $classifier
     * @param int $index
     * @return Classifier
     */
    protected function initSingleClassifier($classifier, $index)
    {
        return $classifier;
    }

    /**
     * Predicts the label of a single sample by majority vote over all
     * base classifiers.
     *
     * @param array $sample
     * @return mixed
     */
    protected function predictSample(array $sample)
    {
        $predictions = [];
        foreach ($this->classifiers as $classifier) {
            /* @var $classifier Classifier */
            $predictions[] = $classifier->predict($sample);
        }

        // Majority vote: most frequent prediction wins.
        $counts = array_count_values($predictions);
        arsort($counts);
        reset($counts);

        return key($counts);
    }
}

View File

@ -0,0 +1,89 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification\Ensemble;
use Phpml\Classification\Ensemble\Bagging;
use Phpml\Classification\DecisionTree;
use Phpml\Classification\NaiveBayes;
use Phpml\Classification\Classifier;
class RandomForest extends Bagging
{
    /**
     * Ratio of features considered at each split: a float in [0.1, 1.0],
     * or one of the strings 'sqrt' / 'log' (default: 'log').
     *
     * @var float|string
     */
    protected $featureSubsetRatio = 'log';

    /**
     * RandomForest trains each DecisionTree on a full-size bootstrap sample
     * (subset ratio 1.0); the randomness comes from the per-split feature
     * subsets configured in initSingleClassifier().
     *
     * @param int $numClassifier
     */
    public function __construct($numClassifier = 50)
    {
        parent::__construct($numClassifier);

        $this->setSubsetRatio(1.0);
    }

    /**
     * This method is used to determine how much of the original columns (features)
     * will be used to construct subsets to train base classifiers.<br>
     *
     * Allowed values: 'sqrt', 'log' or any float number between 0.1 and 1.0 <br>
     *
     * If there are many features that diminish classification performance, then
     * small values should be preferred; otherwise, with a low number of features,
     * the default value ('log': log2(featureCount) + 1 columns) usually gives
     * satisfactory performance.
     *
     * @param mixed $ratio string or float should be given
     * @return $this
     * @throws Exception when the float is out of range or the string is unknown
     */
    public function setFeatureSubsetRatio($ratio)
    {
        if (is_float($ratio) && ($ratio < 0.1 || $ratio > 1.0)) {
            throw new \Exception("When a float given, feature subset ratio should be between 0.1 and 1.0");
        }

        if (is_string($ratio) && $ratio != 'sqrt' && $ratio != 'log') {
            throw new \Exception("When a string given, feature subset ratio can only be 'sqrt' or 'log' ");
        }

        $this->featureSubsetRatio = $ratio;

        return $this;
    }

    /**
     * RandomForest algorithm is usable *only* with DecisionTree
     *
     * @param string $classifier
     * @param array $classifierOptions
     * @return $this
     * @throws Exception when a classifier other than DecisionTree is given
     */
    public function setClassifer(string $classifier, array $classifierOptions = [])
    {
        if ($classifier != DecisionTree::class) {
            throw new \Exception("RandomForest can only use DecisionTree as base classifier");
        }

        return parent::setClassifer($classifier, $classifierOptions);
    }

    /**
     * Configures a freshly created DecisionTree with the number of columns
     * it may inspect at each split, derived from $featureSubsetRatio.
     *
     * @param DecisionTree $classifier
     * @param int $index
     * @return DecisionTree
     */
    protected function initSingleClassifier($classifier, $index)
    {
        if (is_float($this->featureSubsetRatio)) {
            $featureCount = (int)($this->featureSubsetRatio * $this->featureCount);
        } elseif ($this->featureSubsetRatio === 'sqrt') {
            // Bugfix: previously compared $this->featureCount (an int) with
            // 'sqrt', so the 'sqrt' mode could never be selected.
            $featureCount = (int)sqrt($this->featureCount) + 1;
        } else {
            $featureCount = (int)log($this->featureCount, 2) + 1;
        }

        // Never ask for more columns than the dataset has.
        if ($featureCount >= $this->featureCount) {
            $featureCount = $this->featureCount;
        }

        return $classifier->setNumFeatures($featureCount);
    }
}

View File

@ -0,0 +1,127 @@
<?php
declare(strict_types=1);
namespace tests\Classification\Ensemble;
use Phpml\Classification\Ensemble\Bagging;
use Phpml\Classification\DecisionTree;
use Phpml\Classification\NaiveBayes;
use Phpml\Classification\KNearestNeighbors;
use Phpml\ModelManager;
use PHPUnit\Framework\TestCase;
class BaggingTest extends TestCase
{
    /**
     * Play-tennis style dataset: outlook, temperature, humidity, windy,
     * and the class label in the last column.
     */
    private $data = [
        ['sunny', 85, 85, 'false', 'Dont_play' ],
        ['sunny', 80, 90, 'true', 'Dont_play' ],
        ['overcast', 83, 78, 'false', 'Play' ],
        ['rain', 70, 96, 'false', 'Play' ],
        ['rain', 68, 80, 'false', 'Play' ],
        ['rain', 65, 70, 'true', 'Dont_play' ],
        ['overcast', 64, 65, 'true', 'Play' ],
        ['sunny', 72, 95, 'false', 'Dont_play' ],
        ['sunny', 69, 70, 'false', 'Play' ],
        ['rain', 75, 80, 'false', 'Play' ],
        ['sunny', 75, 70, 'true', 'Play' ],
        ['overcast', 72, 90, 'true', 'Play' ],
        ['overcast', 81, 75, 'false', 'Play' ],
        ['rain', 71, 80, 'true', 'Dont_play' ]
    ];

    /**
     * Extra rows with a previously unseen outlook value, used to check
     * incremental training.
     */
    private $extraData = [
        ['scorching', 90, 95, 'false', 'Dont_play'],
        ['scorching', 0, 0, 'false', 'Dont_play'],
    ];

    public function testPredictSingleSample()
    {
        list($data, $targets) = $this->getData($this->data);
        $classifier = $this->getClassifier();
        // Testing with default options
        $classifier->train($data, $targets);
        $this->assertEquals('Dont_play', $classifier->predict(['sunny', 78, 72, 'false']));
        $this->assertEquals('Play', $classifier->predict(['overcast', 60, 60, 'false']));
        $this->assertEquals('Dont_play', $classifier->predict(['rain', 60, 60, 'true']));

        // Incremental training: new rows are merged with the previous data.
        list($data, $targets) = $this->getData($this->extraData);
        $classifier->train($data, $targets);
        $this->assertEquals('Dont_play', $classifier->predict(['scorching', 95, 90, 'true']));
        $this->assertEquals('Play', $classifier->predict(['overcast', 60, 60, 'false']));

        return $classifier;
    }

    public function testSaveAndRestore()
    {
        list($data, $targets) = $this->getData($this->data);
        $classifier = $this->getClassifier(5);
        $classifier->train($data, $targets);

        $testSamples = [['sunny', 78, 72, 'false'], ['overcast', 60, 60, 'false']];
        $predicted = $classifier->predict($testSamples);

        $filename = 'bagging-test-'.rand(100, 999).'-'.uniqid();
        $filepath = tempnam(sys_get_temp_dir(), $filename);
        $modelManager = new ModelManager();
        $modelManager->saveToFile($classifier, $filepath);

        $restoredClassifier = $modelManager->restoreFromFile($filepath);
        $this->assertEquals($classifier, $restoredClassifier);
        $this->assertEquals($predicted, $restoredClassifier->predict($testSamples));
    }

    public function testBaseClassifiers()
    {
        list($data, $targets) = $this->getData($this->data);
        $baseClassifiers = $this->getAvailableBaseClassifiers();
        foreach ($baseClassifiers as $base => $params) {
            $classifier = $this->getClassifier();
            $classifier->setClassifer($base, $params);
            $classifier->train($data, $targets);

            $baseClassifier = new $base(...array_values($params));
            $baseClassifier->train($data, $targets);

            $testData = [['sunny', 78, 72, 'false'], ['overcast', 60, 60, 'false'], ['rain', 60, 60, 'true']];
            foreach ($testData as $test) {
                $result = $classifier->predict($test);
                // Bugfix: compare the ensemble against the standalone base
                // classifier; previously both sides called $classifier, so
                // the assertion trivially compared the ensemble with itself.
                $baseResult = $baseClassifier->predict($test);
                $this->assertEquals($result, $baseResult);
            }
        }
    }

    protected function getClassifier($numBaseClassifiers = 50)
    {
        $classifier = new Bagging($numBaseClassifiers);
        $classifier->setSubsetRatio(1.0);
        $classifier->setClassifer(DecisionTree::class, ['depth' => 10]);

        return $classifier;
    }

    protected function getAvailableBaseClassifiers()
    {
        return [
            DecisionTree::class => ['depth' => 5],
            NaiveBayes::class => []
        ];
    }

    private function getData($input)
    {
        // Populating input data to a size large enough
        // for base classifiers that they can work with a subset of it
        $populated = [];
        for ($i = 0; $i < 20; ++$i) {
            $populated = array_merge($populated, $input);
        }

        shuffle($populated);
        $targets = array_column($populated, 4);
        array_walk($populated, function (&$v) {
            array_splice($v, 4, 1);
        });

        return [$populated, $targets];
    }
}

View File

@ -0,0 +1,38 @@
<?php
declare(strict_types=1);
namespace tests\Classification\Ensemble;
use Phpml\Classification\Ensemble\RandomForest;
use Phpml\Classification\DecisionTree;
use Phpml\Classification\NaiveBayes;
use Phpml\Classification\KNearestNeighbors;
use Phpml\ModelManager;
use tests\Classification\Ensemble\BaggingTest;
class RandomForestTest extends BaggingTest
{
    /**
     * Overrides BaggingTest's factory so the inherited test cases run
     * against a RandomForest instead of plain Bagging.
     */
    protected function getClassifier($numBaseClassifiers = 50)
    {
        $classifier = new RandomForest($numBaseClassifiers);
        $classifier->setFeatureSubsetRatio('log');

        return $classifier;
    }

    /**
     * RandomForest only supports DecisionTree as a base classifier.
     */
    protected function getAvailableBaseClassifiers()
    {
        return [ DecisionTree::class => ['depth' => 5] ];
    }

    public function testOtherBaseClassifier()
    {
        // Idiomatic PHPUnit: declare the expected exception up front instead
        // of a try/catch with an always-true/always-false assertEquals.
        $this->expectException(\Exception::class);

        $classifier = new RandomForest();
        $classifier->setClassifer(NaiveBayes::class);
    }
}