mirror of
https://github.com/Llewellynvdm/php-ml.git
synced 2025-01-10 00:37:55 +00:00
Ensemble Classifiers : Bagging and RandomForest (#36)
* Fuzzy C-Means implementation * Update FuzzyCMeans * Rename FuzzyCMeans to FuzzyCMeans.php * Update NaiveBayes.php * Small fix applied to improve training performance array_unique is replaced with array_count_values+array_keys which is way faster * Revert "Small fix applied to improve training performance" This reverts commit c20253f16ac3e8c37d33ecaee28a87cc767e3b7f. * Revert "Revert "Small fix applied to improve training performance"" This reverts commit ea10e136c4c11b71609ccdcaf9999067e4be473e. * Revert "Small fix applied to improve training performance" This reverts commit c20253f16ac3e8c37d33ecaee28a87cc767e3b7f. * First DecisionTree implementation * Revert "First DecisionTree implementation" This reverts commit 4057a08679c26010c39040a48a3e6dad994a1a99. * DecisionTree * FCM Test * FCM Test * DecisionTree Test * Ensemble classifiers: Bagging and RandomForests * test * Fixes for conflicted files * Bagging and RandomForest ensemble algorithms * Changed unit test * Changed unit test * Changed unit test * Bagging and RandomForest ensemble algorithms * Baggging and RandomForest ensemble algorithms * Bagging and RandomForest ensemble algorithms RandomForest algorithm is improved with changes to original DecisionTree * Bagging and RandomForest ensemble algorithms * Slight fix about use of global Exception class * Fixed the error about wrong use of global Exception class * RandomForest code formatting
This commit is contained in:
parent
72b25ffd42
commit
1d73503958
@ -51,6 +51,11 @@ class DecisionTree implements Classifier
|
||||
*/
|
||||
public $actualDepth = 0;
|
||||
|
||||
/**
|
||||
* @var int
|
||||
*/
|
||||
private $numUsableFeatures = 0;
|
||||
|
||||
/**
|
||||
* @param int $maxDepth
|
||||
*/
|
||||
@ -144,15 +149,15 @@ class DecisionTree implements Classifier
|
||||
$samples = array_combine($records, $this->preprocess($samples));
|
||||
$bestGiniVal = 1;
|
||||
$bestSplit = null;
|
||||
for ($i=0; $i<$this->featureCount; $i++) {
|
||||
$features = $this->getSelectedFeatures();
|
||||
foreach ($features as $i) {
|
||||
$colValues = [];
|
||||
$baseValue = null;
|
||||
foreach ($samples as $index => $row) {
|
||||
$colValues[$index] = $row[$i];
|
||||
if ($baseValue === null) {
|
||||
$baseValue = $row[$i];
|
||||
}
|
||||
}
|
||||
$counts = array_count_values($colValues);
|
||||
arsort($counts);
|
||||
$baseValue = key($counts);
|
||||
$gini = $this->getGiniIndex($baseValue, $colValues, $targets);
|
||||
if ($bestSplit == null || $bestGiniVal > $gini) {
|
||||
$split = new DecisionTreeLeaf();
|
||||
@ -167,6 +172,27 @@ class DecisionTree implements Classifier
|
||||
return $bestSplit;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return array
|
||||
*/
|
||||
protected function getSelectedFeatures()
{
    // Candidate column indices: 0 .. featureCount - 1
    $candidates = range(0, $this->featureCount - 1);

    // A limit of zero means "no restriction": every column is a candidate
    $limit = $this->numUsableFeatures;
    if ($limit == 0) {
        return $candidates;
    }

    // Never request more columns than actually exist
    $limit = min($limit, $this->featureCount);

    // Pick $limit random columns, then hand them back in ascending order
    shuffle($candidates);
    $picked = array_slice($candidates, 0, $limit);
    sort($picked);

    return $picked;
}
|
||||
|
||||
/**
|
||||
* @param string $baseValue
|
||||
* @param array $colValues
|
||||
@ -248,6 +274,27 @@ class DecisionTree implements Classifier
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* This method is used to set number of columns to be used
|
||||
* when deciding a split at an internal node of the tree. <br>
|
||||
* If the value is given 0, then all features are used (default behaviour),
|
||||
* otherwise the given value will be used as a maximum for number of columns
|
||||
* randomly selected for each split operation.
|
||||
*
|
||||
* @param int $numFeatures
|
||||
* @return $this
|
||||
* @throws Exception
|
||||
*/
|
||||
public function setNumFeatures(int $numFeatures)
{
    // Zero is a legal value (it selects all features); only negatives are rejected
    if ($numFeatures >= 0) {
        $this->numUsableFeatures = $numFeatures;

        return $this;
    }

    throw new \Exception("Selected column count should be greater or equal to zero");
}
|
||||
|
||||
/**
|
||||
* @return string
|
||||
*/
|
||||
@ -273,6 +320,7 @@ class DecisionTree implements Classifier
|
||||
$node = $node->rightLeaf;
|
||||
}
|
||||
} while ($node);
|
||||
return $node->classValue;
|
||||
|
||||
return $node ? $node->classValue : $this->labels[0];
|
||||
}
|
||||
}
|
||||
|
@ -62,7 +62,7 @@ class DecisionTreeLeaf
|
||||
public function evaluate($record)
|
||||
{
|
||||
$recordField = $record[$this->columnIndex];
|
||||
if (preg_match("/^([<>=]{1,2})\s*(.*)/", $this->value, $matches)) {
|
||||
if (is_string($this->value) && preg_match("/^([<>=]{1,2})\s*(.*)/", $this->value, $matches)) {
|
||||
$op = $matches[1];
|
||||
$value= floatval($matches[2]);
|
||||
$recordField = strval($recordField);
|
||||
|
198
src/Phpml/Classification/Ensemble/Bagging.php
Normal file
198
src/Phpml/Classification/Ensemble/Bagging.php
Normal file
@ -0,0 +1,198 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Phpml\Classification\Ensemble;
|
||||
|
||||
use Phpml\Helper\Predictable;
|
||||
use Phpml\Helper\Trainable;
|
||||
use Phpml\Math\Statistic\Mean;
|
||||
use Phpml\Classification\Classifier;
|
||||
use Phpml\Classification\DecisionTree;
|
||||
use Phpml\Classification\NaiveBayes;
|
||||
|
||||
class Bagging implements Classifier
{
    use Trainable, Predictable;

    /**
     * Number of samples accumulated over all train() calls
     *
     * @var int
     */
    protected $numSamples;

    /**
     * Labels accumulated over all train() calls
     *
     * @var array
     */
    private $targets = [];

    /**
     * Number of columns in the first sample of the most recent train() call
     *
     * @var int
     */
    protected $featureCount = 0;

    /**
     * Number of base classifiers in the ensemble
     *
     * @var int
     */
    protected $numClassifier;

    /**
     * Class name of the base classifier (not an instance)
     *
     * @var string
     */
    protected $classifier = DecisionTree::class;

    /**
     * Constructor arguments for the base classifier; only the values are used, in order
     *
     * @var array
     */
    protected $classifierOptions = ['depth' => 20];

    /**
     * Base classifier instances created by the last train() call
     *
     * @var array
     */
    protected $classifiers;

    /**
     * @var float
     */
    protected $subsetRatio = 0.5;

    /**
     * Samples accumulated over all train() calls
     *
     * @var array
     */
    private $samples = [];

    /**
     * Creates an ensemble classifier with given number of base classifiers<br>
     * Default number of base classifiers is 50.
     * The more base classifiers, the better the performance, but at the cost of processing time
     *
     * @param int $numClassifier
     */
    public function __construct($numClassifier = 50)
    {
        $this->numClassifier = $numClassifier;
    }

    /**
     * This method determines the ratio of samples used to create the 'bootstrap' subset,
     * i.e., random samples drawn from the original dataset with replacement (allow repeats),
     * to train each base classifier.
     *
     * @param float $ratio value between 0.1 and 1.0 (inclusive)
     * @return $this
     * @throws \Exception when the ratio is outside of the allowed range
     */
    public function setSubsetRatio(float $ratio)
    {
        if ($ratio < 0.1 || $ratio > 1.0) {
            throw new \Exception("Subset ratio should be between 0.1 and 1.0");
        }
        $this->subsetRatio = $ratio;

        return $this;
    }

    /**
     * This method is used to set the base classifier. Default value is
     * DecisionTree::class, but any class that implements the <i>Classifier</i>
     * can be used. <br>
     * While giving the parameters of the classifier, the values should be
     * given in the order they are in the constructor of the classifier and parameter
     * names are neglected.
     *
     * @param string $classifier class name of the base classifier
     * @param array $classifierOptions constructor arguments, in constructor order
     * @return $this
     */
    public function setClassifer(string $classifier, array $classifierOptions = [])
    {
        $this->classifier = $classifier;
        $this->classifierOptions = $classifierOptions;

        return $this;
    }

    /**
     * Appends the given data to the data seen so far, then rebuilds every base
     * classifier and trains each one on its own random subset of the accumulated data.
     *
     * @param array $samples
     * @param array $targets
     */
    public function train(array $samples, array $targets)
    {
        $this->samples = array_merge($this->samples, $samples);
        $this->targets = array_merge($this->targets, $targets);
        $this->featureCount = count($samples[0]);
        $this->numSamples = count($this->samples);

        // Init classifiers and train them with random sub-samples
        $this->classifiers = $this->initClassifiers();
        foreach ($this->classifiers as $index => $classifier) {
            list($subsetSamples, $subsetTargets) = $this->getRandomSubset($index);
            $classifier->train($subsetSamples, $subsetTargets);
        }
    }

    /**
     * Draws (with replacement) the training subset for the base classifier at $index.
     * About numSamples * subsetRatio rows are drawn, each one picked from the
     * range [offset, numSamples - 1], where the offset depends on $index.
     *
     * NOTE(review): because of the offset, base classifiers with a non-zero offset never
     * see the rows in front of it - confirm that this windowing is intended.
     *
     * @param int $index
     * @return array [samples, targets]
     */
    protected function getRandomSubset($index)
    {
        // Half of the requested subset size; deliberately kept fractional
        $halfLength = $this->numSamples * $this->subsetRatio / 2;

        // Explicit cast: applying % to a float with a fractional part is deprecated as of PHP 8.1
        $offset = ((int) ($index * $halfLength)) % $this->numSamples;
        $drawCount = (int) ceil($halfLength * 2);
        $lastIndex = $this->numSamples - 1;

        $samples = [];
        $targets = [];
        for ($i = 0; $i < $drawCount; ++$i) {
            $rand = rand($offset, $lastIndex);
            $samples[] = $this->samples[$rand];
            $targets[] = $this->targets[$rand];
        }

        return [$samples, $targets];
    }

    /**
     * Creates $numClassifier instances of the base classifier class.
     *
     * @return array
     */
    protected function initClassifiers()
    {
        $ref = new \ReflectionClass($this->classifier);

        // Options are positional (see setClassifer()). The keys have to be dropped, because
        // PHP 8 interprets string keys given to newInstanceArgs() as parameter names.
        $args = array_values($this->classifierOptions);

        $classifiers = [];
        for ($i = 0; $i < $this->numClassifier; ++$i) {
            $obj = $args ? $ref->newInstanceArgs($args) : $ref->newInstance();
            $classifiers[] = $this->initSingleClassifier($obj, $i);
        }

        return $classifiers;
    }

    /**
     * Extension point for subclasses to configure a freshly created base classifier.
     * This implementation returns the classifier untouched.
     *
     * @param Classifier $classifier
     * @param int $index
     * @return Classifier
     */
    protected function initSingleClassifier($classifier, $index)
    {
        return $classifier;
    }

    /**
     * Collects the prediction of every base classifier and returns the most frequent one.
     *
     * @param array $sample
     * @return mixed
     */
    protected function predictSample(array $sample)
    {
        $predictions = [];
        foreach ($this->classifiers as $classifier) {
            /* @var $classifier Classifier */
            $predictions[] = $classifier->predict($sample);
        }

        // Majority vote: after the descending sort the first key is the winning label
        $counts = array_count_values($predictions);
        arsort($counts);
        reset($counts);

        return key($counts);
    }
}
|
89
src/Phpml/Classification/Ensemble/RandomForest.php
Normal file
89
src/Phpml/Classification/Ensemble/RandomForest.php
Normal file
@ -0,0 +1,89 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Phpml\Classification\Ensemble;
|
||||
|
||||
use Phpml\Classification\Ensemble\Bagging;
|
||||
use Phpml\Classification\DecisionTree;
|
||||
use Phpml\Classification\NaiveBayes;
|
||||
use Phpml\Classification\Classifier;
|
||||
|
||||
class RandomForest extends Bagging
{
    /**
     * Either a ratio (float) or the name of a rule ('sqrt' or 'log')
     *
     * @var float|string
     */
    protected $featureSubsetRatio = 'log';

    /**
     * Creates the ensemble and sets the sample subset ratio to 1.0.
     *
     * @param int $numClassifier number of base classifiers
     */
    public function __construct($numClassifier = 50)
    {
        parent::__construct($numClassifier);

        $this->setSubsetRatio(1.0);
    }

    /**
     * This method is used to determine how much of the original columns (features)
     * will be used to construct subsets to train base classifiers.<br>
     *
     * Allowed values: 'sqrt', 'log' or any float number between 0.1 and 1.0 <br>
     *
     * With n features, 'sqrt' selects floor(sqrt(n)) + 1 columns, 'log' selects
     * floor(log2(n)) + 1 columns and a float selects floor(ratio * n) columns;
     * the result is always kept between 1 and n. The default is 'log'.
     *
     * @param mixed $ratio string or float should be given
     * @return $this
     * @throws \Exception when the float is out of range or the string is not a known rule
     */
    public function setFeatureSubsetRatio($ratio)
    {
        if (is_float($ratio) && ($ratio < 0.1 || $ratio > 1.0)) {
            throw new \Exception("When a float given, feature subset ratio should be between 0.1 and 1.0");
        }
        if (is_string($ratio) && $ratio !== 'sqrt' && $ratio !== 'log') {
            throw new \Exception("When a string given, feature subset ratio can only be 'sqrt' or 'log' ");
        }
        $this->featureSubsetRatio = $ratio;

        return $this;
    }

    /**
     * RandomForest algorithm is usable *only* with DecisionTree
     *
     * @param string $classifier
     * @param array $classifierOptions
     * @return $this
     * @throws \Exception when a class other than DecisionTree is given
     */
    public function setClassifer(string $classifier, array $classifierOptions = [])
    {
        if ($classifier !== DecisionTree::class) {
            throw new \Exception("RandomForest can only use DecisionTree as base classifier");
        }

        return parent::setClassifer($classifier, $classifierOptions);
    }

    /**
     * Tells the given tree how many randomly chosen columns it may consider per split.
     *
     * @param DecisionTree $classifier
     * @param int $index
     * @return DecisionTree
     */
    protected function initSingleClassifier($classifier, $index)
    {
        if (is_float($this->featureSubsetRatio)) {
            $featureCount = (int) ($this->featureSubsetRatio * $this->featureCount);
        } elseif ($this->featureSubsetRatio === 'sqrt') {
            // The rule name lives in featureSubsetRatio, not in the (numeric) featureCount
            $featureCount = (int) sqrt($this->featureCount) + 1;
        } else {
            $featureCount = (int) log($this->featureCount, 2) + 1;
        }

        // Keep the value within [1, featureCount]: passing 0 to setNumFeatures() would
        // switch the tree to "use all features" instead of a small random subset
        $featureCount = min(max(1, $featureCount), $this->featureCount);

        return $classifier->setNumFeatures($featureCount);
    }
}
|
127
tests/Phpml/Classification/Ensemble/BaggingTest.php
Normal file
127
tests/Phpml/Classification/Ensemble/BaggingTest.php
Normal file
@ -0,0 +1,127 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace tests\Classification\Ensemble;
|
||||
|
||||
use Phpml\Classification\Ensemble\Bagging;
|
||||
use Phpml\Classification\DecisionTree;
|
||||
use Phpml\Classification\NaiveBayes;
|
||||
use Phpml\Classification\KNearestNeighbors;
|
||||
use Phpml\ModelManager;
|
||||
use PHPUnit\Framework\TestCase;
|
||||
|
||||
class BaggingTest extends TestCase
{
    /**
     * Rows of [outlook, temperature, humidity, windy, label]
     *
     * @var array
     */
    private $data = [
        ['sunny',       85,    85,    'false',    'Dont_play' ],
        ['sunny',       80,    90,    'true',     'Dont_play' ],
        ['overcast',    83,    78,    'false',    'Play'      ],
        ['rain',        70,    96,    'false',    'Play'      ],
        ['rain',        68,    80,    'false',    'Play'      ],
        ['rain',        65,    70,    'true',     'Dont_play' ],
        ['overcast',    64,    65,    'true',     'Play'      ],
        ['sunny',       72,    95,    'false',    'Dont_play' ],
        ['sunny',       69,    70,    'false',    'Play'      ],
        ['rain',        75,    80,    'false',    'Play'      ],
        ['sunny',       75,    70,    'true',     'Play'      ],
        ['overcast',    72,    90,    'true',     'Play'      ],
        ['overcast',    81,    75,    'false',    'Play'      ],
        ['rain',        71,    80,    'true',     'Dont_play' ]
    ];

    /**
     * Additional rows used to check training in a second step
     *
     * @var array
     */
    private $extraData = [
        ['scorching',   90,     95,     'false',   'Dont_play'],
        ['scorching',   0,      0,      'false',   'Dont_play'],
    ];

    public function testPredictSingleSample()
    {
        list($data, $targets) = $this->getData($this->data);
        $classifier = $this->getClassifier();
        // Testing with default options
        $classifier->train($data, $targets);
        $this->assertEquals('Dont_play', $classifier->predict(['sunny', 78, 72, 'false']));
        $this->assertEquals('Play', $classifier->predict(['overcast', 60, 60, 'false']));
        $this->assertEquals('Dont_play', $classifier->predict(['rain', 60, 60, 'true']));

        // A second train() call adds the extra rows to the data given before
        list($data, $targets) = $this->getData($this->extraData);
        $classifier->train($data, $targets);
        $this->assertEquals('Dont_play', $classifier->predict(['scorching', 95, 90, 'true']));
        $this->assertEquals('Play', $classifier->predict(['overcast', 60, 60, 'false']));

        return $classifier;
    }

    public function testSaveAndRestore()
    {
        list($data, $targets) = $this->getData($this->data);
        $classifier = $this->getClassifier(5);
        $classifier->train($data, $targets);

        $testSamples = [['sunny', 78, 72, 'false'], ['overcast', 60, 60, 'false']];
        $predicted = $classifier->predict($testSamples);

        $filename = 'bagging-test-'.rand(100, 999).'-'.uniqid();
        $filepath = tempnam(sys_get_temp_dir(), $filename);

        try {
            $modelManager = new ModelManager();
            $modelManager->saveToFile($classifier, $filepath);

            $restoredClassifier = $modelManager->restoreFromFile($filepath);
            $this->assertEquals($classifier, $restoredClassifier);
            $this->assertEquals($predicted, $restoredClassifier->predict($testSamples));
        } finally {
            // Do not leave the model file behind, even when an assertion fails
            if (is_string($filepath) && is_file($filepath)) {
                unlink($filepath);
            }
        }
    }

    public function testBaseClassifiers()
    {
        list($data, $targets) = $this->getData($this->data);
        $baseClassifiers = $this->getAvailableBaseClassifiers();
        $testData = [['sunny', 78, 72, 'false'], ['overcast', 60, 60, 'false'], ['rain', 60, 60, 'true']];

        foreach ($baseClassifiers as $base => $params) {
            $classifier = $this->getClassifier();
            $classifier->setClassifer($base, $params);
            $classifier->train($data, $targets);

            $baseClassifier = new $base(...array_values($params));
            $baseClassifier->train($data, $targets);

            foreach ($testData as $test) {
                $result = $classifier->predict($test);
                // Compare against the stand-alone base classifier, not against the ensemble itself
                $baseResult = $baseClassifier->predict($test);
                $this->assertEquals($result, $baseResult);
            }
        }
    }

    /**
     * Builds the ensemble under test; subclasses override this to test their own ensemble.
     *
     * @param int $numBaseClassifiers
     * @return Bagging
     */
    protected function getClassifier($numBaseClassifiers = 50)
    {
        $classifier = new Bagging($numBaseClassifiers);
        $classifier->setSubsetRatio(1.0);
        $classifier->setClassifer(DecisionTree::class, ['depth' => 10]);

        return $classifier;
    }

    /**
     * @return array base classifier class name => constructor options
     */
    protected function getAvailableBaseClassifiers()
    {
        return [
            DecisionTree::class => ['depth' => 5],
            NaiveBayes::class => []
        ];
    }

    /**
     * Repeats the input rows 20 times, shuffles them and separates the label column.
     *
     * @param array $input rows with the label in column 4
     * @return array [samples, targets]
     */
    private function getData($input)
    {
        // Populating input data to a size large enough
        // for base classifiers that they can work with a subset of it
        $populated = [];
        for ($i = 0; $i < 20; ++$i) {
            $populated = array_merge($populated, $input);
        }
        shuffle($populated);

        // Column 4 holds the label: copy it out, then cut it from every row
        $targets = array_column($populated, 4);
        array_walk($populated, function (&$v) {
            array_splice($v, 4, 1);
        });

        return [$populated, $targets];
    }
}
|
38
tests/Phpml/Classification/Ensemble/RandomForestTest.php
Normal file
38
tests/Phpml/Classification/Ensemble/RandomForestTest.php
Normal file
@ -0,0 +1,38 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace tests\Classification\Ensemble;
|
||||
|
||||
use Phpml\Classification\Ensemble\RandomForest;
|
||||
use Phpml\Classification\DecisionTree;
|
||||
use Phpml\Classification\NaiveBayes;
|
||||
use Phpml\Classification\KNearestNeighbors;
|
||||
use Phpml\ModelManager;
|
||||
use tests\Classification\Ensemble\BaggingTest;
|
||||
|
||||
class RandomForestTest extends BaggingTest
{
    /**
     * Runs the inherited tests against a RandomForest instead of a plain Bagging ensemble.
     *
     * @param int $numBaseClassifiers
     * @return RandomForest
     */
    protected function getClassifier($numBaseClassifiers = 50)
    {
        $classifier = new RandomForest($numBaseClassifiers);
        $classifier->setFeatureSubsetRatio('log');

        return $classifier;
    }

    /**
     * @return array base classifier class name => constructor options
     */
    protected function getAvailableBaseClassifiers()
    {
        return [ DecisionTree::class => ['depth' => 5] ];
    }

    public function testOtherBaseClassifier()
    {
        $classifier = new RandomForest();

        // The assertion stays outside of the try block: a failed PHPUnit assertion is
        // reported through an exception and a catch (\Exception) would swallow it
        $thrown = false;
        try {
            $classifier->setClassifer(NaiveBayes::class);
        } catch (\Exception $ex) {
            $thrown = true;
        }

        $this->assertTrue($thrown, 'RandomForest should reject base classifiers other than DecisionTree');
    }
}
|
Loading…
Reference in New Issue
Block a user