From 1d73503958b477a5d166e69f7df314db2660f537 Mon Sep 17 00:00:00 2001 From: Mustafa Karabulut Date: Tue, 7 Feb 2017 13:37:56 +0200 Subject: [PATCH] Ensemble Classifiers : Bagging and RandomForest (#36) * Fuzzy C-Means implementation * Update FuzzyCMeans * Rename FuzzyCMeans to FuzzyCMeans.php * Update NaiveBayes.php * Small fix applied to improve training performance array_unique is replaced with array_count_values+array_keys which is way faster * Revert "Small fix applied to improve training performance" This reverts commit c20253f16ac3e8c37d33ecaee28a87cc767e3b7f. * Revert "Revert "Small fix applied to improve training performance"" This reverts commit ea10e136c4c11b71609ccdcaf9999067e4be473e. * Revert "Small fix applied to improve training performance" This reverts commit c20253f16ac3e8c37d33ecaee28a87cc767e3b7f. * First DecisionTree implementation * Revert "First DecisionTree implementation" This reverts commit 4057a08679c26010c39040a48a3e6dad994a1a99. * DecisionTree * FCM Test * FCM Test * DecisionTree Test * Ensemble classifiers: Bagging and RandomForests * test * Fixes for conflicted files * Bagging and RandomForest ensemble algorithms * Changed unit test * Changed unit test * Changed unit test * Bagging and RandomForest ensemble algorithms * Baggging and RandomForest ensemble algorithms * Bagging and RandomForest ensemble algorithms RandomForest algorithm is improved with changes to original DecisionTree * Bagging and RandomForest ensemble algorithms * Slight fix about use of global Exception class * Fixed the error about wrong use of global Exception class * RandomForest code formatting --- src/Phpml/Classification/DecisionTree.php | 60 +++++- .../DecisionTree/DecisionTreeLeaf.php | 2 +- src/Phpml/Classification/Ensemble/Bagging.php | 198 ++++++++++++++++++ .../Classification/Ensemble/RandomForest.php | 89 ++++++++ .../Classification/Ensemble/BaggingTest.php | 127 +++++++++++ .../Ensemble/RandomForestTest.php | 38 ++++ 6 files changed, 507 insertions(+), 7 deletions(-) create mode 100644 src/Phpml/Classification/Ensemble/Bagging.php create mode 100644 src/Phpml/Classification/Ensemble/RandomForest.php create mode 100644 tests/Phpml/Classification/Ensemble/BaggingTest.php create mode 100644 tests/Phpml/Classification/Ensemble/RandomForestTest.php diff --git a/src/Phpml/Classification/DecisionTree.php b/src/Phpml/Classification/DecisionTree.php index 1a39cbe..1a04802 100644 --- a/src/Phpml/Classification/DecisionTree.php +++ b/src/Phpml/Classification/DecisionTree.php @@ -51,6 +51,11 @@ class DecisionTree implements Classifier */ public $actualDepth = 0; + /** + * @var int + */ + private $numUsableFeatures = 0; + /** * @param int $maxDepth */ @@ -144,15 +149,15 @@ class DecisionTree implements Classifier $samples = array_combine($records, $this->preprocess($samples)); $bestGiniVal = 1; $bestSplit = null; - for ($i=0; $i<$this->featureCount; $i++) { + $features = $this->getSelectedFeatures(); + foreach ($features as $i) { $colValues = []; - $baseValue = null; foreach ($samples as $index => $row) { $colValues[$index] = $row[$i]; - if ($baseValue === null) { - $baseValue = $row[$i]; - } } + $counts = array_count_values($colValues); + arsort($counts); + $baseValue = key($counts); $gini = $this->getGiniIndex($baseValue, $colValues, $targets); if ($bestSplit == null || $bestGiniVal > $gini) { $split = new DecisionTreeLeaf(); @@ -167,6 +172,27 @@ class DecisionTree implements Classifier return $bestSplit; } + /** + * @return array + */ + protected function getSelectedFeatures() + { + 
$allFeatures = range(0, $this->featureCount - 1); + if ($this->numUsableFeatures == 0) { + return $allFeatures; + } + + $numFeatures = $this->numUsableFeatures; + if ($numFeatures > $this->featureCount) { + $numFeatures = $this->featureCount; + } + shuffle($allFeatures); + $selectedFeatures = array_slice($allFeatures, 0, $numFeatures, false); + sort($selectedFeatures); + + return $selectedFeatures; + } + /** * @param string $baseValue * @param array $colValues @@ -248,6 +274,27 @@ class DecisionTree implements Classifier return false; } + /** + * This method is used to set number of columns to be used + * when deciding a split at an internal node of the tree.
+ * If the value is given 0, then all features are used (default behaviour), + * otherwise the given value will be used as a maximum for number of columns + * randomly selected for each split operation. + * + * @param int $numFeatures + * @return $this + * @throws Exception + */ + public function setNumFeatures(int $numFeatures) + { + if ($numFeatures < 0) { + throw new \Exception("Selected column count should be greater or equal to zero"); + } + + $this->numUsableFeatures = $numFeatures; + return $this; + } + /** * @return string */ @@ -273,6 +320,7 @@ class DecisionTree implements Classifier $node = $node->rightLeaf; } } while ($node); - return $node->classValue; + + return $node ? $node->classValue : $this->labels[0]; } } diff --git a/src/Phpml/Classification/DecisionTree/DecisionTreeLeaf.php b/src/Phpml/Classification/DecisionTree/DecisionTreeLeaf.php index d428919..1993864 100644 --- a/src/Phpml/Classification/DecisionTree/DecisionTreeLeaf.php +++ b/src/Phpml/Classification/DecisionTree/DecisionTreeLeaf.php @@ -62,7 +62,7 @@ class DecisionTreeLeaf public function evaluate($record) { $recordField = $record[$this->columnIndex]; - if (preg_match("/^([<>=]{1,2})\s*(.*)/", $this->value, $matches)) { + if (is_string($this->value) && preg_match("/^([<>=]{1,2})\s*(.*)/", $this->value, $matches)) { $op = $matches[1]; $value= floatval($matches[2]); $recordField = strval($recordField); diff --git a/src/Phpml/Classification/Ensemble/Bagging.php b/src/Phpml/Classification/Ensemble/Bagging.php new file mode 100644 index 0000000..817869e --- /dev/null +++ b/src/Phpml/Classification/Ensemble/Bagging.php @@ -0,0 +1,198 @@ + 20]; + + /** + * @var array + */ + protected $classifiers; + + /** + * @var float + */ + protected $subsetRatio = 0.5; + + /** + * @var array + */ + private $samples = []; + + /** + * Creates an ensemble classifier with given number of base classifiers
+     * The default number of base classifiers is 50.
+     * Using more base classifiers usually improves accuracy, at the cost of processing time.
+     *
+     * @param int $numClassifier
+     */
+    public function __construct($numClassifier = 50)
+    {
+        $this->numClassifier = $numClassifier;
+    }
+
+    /**
+     * This method determines the ratio of samples used to create the 'bootstrap' subset,
+     * i.e., random samples drawn from the original dataset with replacement (repeats allowed),
+     * to train each base classifier.
+     *
+     * @param float $ratio
+     * @return $this
+     * @throws Exception
+     */
+    public function setSubsetRatio(float $ratio)
+    {
+        if ($ratio < 0.1 || $ratio > 1.0) {
+            throw new \Exception("Subset ratio should be between 0.1 and 1.0");
+        }
+        $this->subsetRatio = $ratio;
+        return $this;
+    }
+
+    /**
+     * This method is used to set the base classifier. Default value is
+     * DecisionTree::class, but any class that implements the Classifier
+     * interface can be used.
+     * Classifier options should be given as an ordered list of constructor
+     * arguments; parameter names are ignored.
+     *
+     * @param string $classifier
+     * @param array $classifierOptions
+     * @return $this
+     */
+    public function setClassifer(string $classifier, array $classifierOptions = [])
+    {
+        $this->classifier = $classifier;
+        $this->classifierOptions = $classifierOptions;
+        return $this;
+    }
+
+    /**
+     * @param array $samples
+     * @param array $targets
+     */
+    public function train(array $samples, array $targets)
+    {
+        $this->samples = array_merge($this->samples, $samples);
+        $this->targets = array_merge($this->targets, $targets);
+        $this->featureCount = count($samples[0]);
+        $this->numSamples = count($this->samples);
+
+        // Init classifiers and train them with random sub-samples
+        $this->classifiers = $this->initClassifiers();
+        $index = 0;
+        foreach ($this->classifiers as $classifier) {
+            list($samples, $targets) = $this->getRandomSubset($index);
+            $classifier->train($samples, $targets);
+            ++$index;
+        }
+    }
+
+    /**
+     * @param int $index
+     * @return array
+     */
+    protected function getRandomSubset($index)
+    {
+        // Draw subsetRatio * numSamples random samples (with replacement) for each base classifier
+        $subsetLength = $this->numSamples * $this->subsetRatio / 2;
+        $index = $index * $subsetLength % $this->numSamples;
+        $samples = [];
+        $targets = [];
+        for ($i=0; $i<$subsetLength * 2; $i++) {
+            $rand = rand($index, $this->numSamples - 1);
+            $samples[] = $this->samples[$rand];
+            $targets[] = $this->targets[$rand];
+        }
+        return [$samples, $targets];
+    }
+
+    /**
+     * @return array
+     */
+    protected function initClassifiers()
+    {
+        $classifiers = [];
+        for ($i=0; $i<$this->numClassifier; $i++) {
+            $ref = new \ReflectionClass($this->classifier);
+            if ($this->classifierOptions) {
+                $obj = $ref->newInstanceArgs($this->classifierOptions);
+            } else {
+                $obj = $ref->newInstance();
+            }
+            $classifiers[] = $this->initSingleClassifier($obj, $i);
+        }
+        return $classifiers;
+    }
+
+    /**
+     * @param Classifier $classifier
+     * @param int $index
+     * @return Classifier
+     */
+    protected function initSingleClassifier($classifier, $index)
+    {
+        return $classifier;
+    }
+
+    /**
+     * @param array $sample
+     * @return mixed
+     */
+    protected function predictSample(array $sample)
+    {
+        // Majority vote over the predictions of all base classifiers
+        $predictions = [];
+        foreach ($this->classifiers as $classifier) {
+            /* @var $classifier Classifier */
+            $predictions[] = $classifier->predict($sample);
+        }
+
+        $counts = array_count_values($predictions);
+        arsort($counts);
+        reset($counts);
+        return key($counts);
+    }
+}
diff --git a/src/Phpml/Classification/Ensemble/RandomForest.php b/src/Phpml/Classification/Ensemble/RandomForest.php
new file mode 100644
index 0000000..37df7ae
--- /dev/null
+++ b/src/Phpml/Classification/Ensemble/RandomForest.php
@@ -0,0 +1,89 @@
+setSubsetRatio(1.0);
+    }
+
+    /**
+     * This method determines what portion of the original columns (features)
+     * will be used to construct the subsets used to train each base classifier.
+ * + * Allowed values: 'sqrt', 'log' or any float number between 0.1 and 1.0
+     * Allowed values: 'sqrt', 'log', or any float between 0.1 and 1.0
+     *
+     * If there are many features that diminish classification performance,
+     * smaller values should be preferred; otherwise, with a low number of features,
+     * the default value will usually give satisfactory performance.
+     *
+     * @param string|float $ratio
+     * @return $this
+     * @throws Exception
+     */
+    public function setFeatureSubsetRatio($ratio)
+    {
+        if (is_float($ratio) && ($ratio < 0.1 || $ratio > 1.0)) {
+            throw new \Exception("When a float is given, feature subset ratio should be between 0.1 and 1.0");
+        }
+        if (is_string($ratio) && $ratio != 'sqrt' && $ratio != 'log') {
+            throw new \Exception("When a string is given, feature subset ratio can only be 'sqrt' or 'log'");
+        }
+        $this->featureSubsetRatio = $ratio;
+        return $this;
+    }
+
+    /**
+     * RandomForest is usable *only* with DecisionTree as its base classifier
+     *
+     * @param string $classifier
+     * @param array $classifierOptions
+     * @return $this
+     * @throws Exception
+     */
+    public function setClassifer(string $classifier, array $classifierOptions = [])
+    {
+        if ($classifier != DecisionTree::class) {
+            throw new \Exception("RandomForest can only use DecisionTree as base classifier");
+        }
+
+        return parent::setClassifer($classifier, $classifierOptions);
+    }
+
+    /**
+     * @param DecisionTree $classifier
+     * @param int $index
+     * @return DecisionTree
+     */
+    protected function initSingleClassifier($classifier, $index)
+    {
+        // Limit the number of features each tree may consider per split,
+        // according to the configured feature subset ratio
+        if (is_float($this->featureSubsetRatio)) {
+            $featureCount = (int)($this->featureSubsetRatio * $this->featureCount);
+        } elseif ($this->featureSubsetRatio == 'sqrt') {
+            $featureCount = (int)sqrt($this->featureCount) + 1;
+        } else {
+            $featureCount = (int)log($this->featureCount, 2) + 1;
+        }
+
+        if ($featureCount >= $this->featureCount) {
+            $featureCount = $this->featureCount;
+        }
+
+        return $classifier->setNumFeatures($featureCount);
+    }
+}
diff --git a/tests/Phpml/Classification/Ensemble/BaggingTest.php b/tests/Phpml/Classification/Ensemble/BaggingTest.php
new file mode 100644
index 0000000..e7dfcad
--- /dev/null
+++ b/tests/Phpml/Classification/Ensemble/BaggingTest.php
@@ -0,0 +1,127 @@
+getData($this->data);
+        $classifier = $this->getClassifier();
+        // Testing with default options
+        $classifier->train($data, $targets);
+        $this->assertEquals('Dont_play', $classifier->predict(['sunny', 78, 72, 'false']));
+        $this->assertEquals('Play', $classifier->predict(['overcast', 60, 60, 'false']));
+        $this->assertEquals('Dont_play', $classifier->predict(['rain', 60, 60, 'true']));
+
+        list($data, $targets) = $this->getData($this->extraData);
+        $classifier->train($data, $targets);
+        $this->assertEquals('Dont_play', $classifier->predict(['scorching', 95, 90, 'true']));
+        $this->assertEquals('Play', $classifier->predict(['overcast', 60, 60, 'false']));
+
+        return $classifier;
+    }
+
+    public function testSaveAndRestore()
+    {
+        list($data, $targets) = $this->getData($this->data);
+        $classifier = $this->getClassifier(5);
+        $classifier->train($data, $targets);
+
+        $testSamples = [['sunny', 78, 72, 'false'], ['overcast', 60, 60, 'false']];
+        $predicted = $classifier->predict($testSamples);
+
+        $filename = 'bagging-test-'.rand(100, 999).'-'.uniqid();
+        $filepath = tempnam(sys_get_temp_dir(), $filename);
+        $modelManager = new ModelManager();
+        $modelManager->saveToFile($classifier, $filepath);
+
+        $restoredClassifier = $modelManager->restoreFromFile($filepath);
+        $this->assertEquals($classifier, $restoredClassifier);
+        $this->assertEquals($predicted, $restoredClassifier->predict($testSamples));
+    }
+
+    public function testBaseClassifiers()
+    {
+        list($data, $targets) = $this->getData($this->data);
+        $baseClassifiers = $this->getAvailableBaseClassifiers();
+
+        foreach ($baseClassifiers as $base => $params) {
+            $classifier = $this->getClassifier();
+            $classifier->setClassifer($base, $params);
+            $classifier->train($data, $targets);
+
+            $baseClassifier = new $base(...array_values($params));
+            $baseClassifier->train($data, $targets);
+            $testData = [['sunny', 78, 72, 'false'], ['overcast', 60, 60, 'false'], ['rain', 60, 60, 'true']];
+            foreach ($testData as $test) {
+                // The ensemble should agree with a single base classifier on this simple dataset
+                $result = $classifier->predict($test);
+                $baseResult = $baseClassifier->predict($test);
+                $this->assertEquals($result, $baseResult);
+            }
+        }
+    }
+
+    protected function getClassifier($numBaseClassifiers = 50)
+    {
+        $classifier = new Bagging($numBaseClassifiers);
+        $classifier->setSubsetRatio(1.0);
+        $classifier->setClassifer(DecisionTree::class, ['depth' => 10]);
+        return $classifier;
+    }
+
+    protected function getAvailableBaseClassifiers()
+    {
+        return [
+            DecisionTree::class => ['depth' => 5],
+            NaiveBayes::class => []
+        ];
+    }
+
+    private function getData($input)
+    {
+        // Replicate the input data so that it is large enough
+        // for base classifiers to be trained on subsets of it
+        $populated = [];
+        for ($i=0; $i<20; $i++) {
+            $populated = array_merge($populated, $input);
+        }
+        shuffle($populated);
+        $targets = array_column($populated, 4);
+        array_walk($populated, function (&$v) {
+            array_splice($v, 4, 1);
+        });
+        return [$populated, $targets];
+    }
+}
diff --git a/tests/Phpml/Classification/Ensemble/RandomForestTest.php b/tests/Phpml/Classification/Ensemble/RandomForestTest.php
new file mode 100644
index 0000000..d32507c
--- /dev/null
+++ b/tests/Phpml/Classification/Ensemble/RandomForestTest.php
@@ -0,0 +1,38 @@
+setFeatureSubsetRatio('log');
+        return $classifier;
+    }
+
+    protected function getAvailableBaseClassifiers()
+    {
+        return [ DecisionTree::class => ['depth' => 5] ];
+    }
+
+    public function testOtherBaseClassifier()
+    {
+        $this->expectException(\Exception::class);
+
+        $classifier = new RandomForest();
+        $classifier->setClassifer(NaiveBayes::class);
+    }
+}
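For reference, a minimal usage sketch of the two new ensemble classifiers. The training data below is illustrative, not part of the patch; the method calls mirror the API added in this diff (including the setClassifer() spelling), and the RandomForest constructor argument is assumed to mirror Bagging's:

<?php

use Phpml\Classification\DecisionTree;
use Phpml\Classification\Ensemble\Bagging;
use Phpml\Classification\Ensemble\RandomForest;

// Each sample is [outlook, temperature, humidity, windy]; targets are the class labels
$samples = [
    ['sunny',    85, 85, 'false'],
    ['sunny',    80, 90, 'true'],
    ['overcast', 83, 78, 'false'],
    ['rain',     70, 96, 'false'],
    ['rain',     65, 70, 'true'],
    ['overcast', 64, 65, 'true'],
];
$targets = ['Dont_play', 'Dont_play', 'Play', 'Play', 'Dont_play', 'Play'];

// Bagging: each base classifier is trained on a random bootstrap subset of the samples
$bagging = new Bagging(30);                                   // 30 base classifiers
$bagging->setSubsetRatio(0.8);                                // fraction of samples drawn per subset
$bagging->setClassifer(DecisionTree::class, ['depth' => 10]); // constructor args given in order
$bagging->train($samples, $targets);
$label = $bagging->predict(['sunny', 78, 72, 'false']);

// RandomForest: bagged decision trees that also restrict the features considered at each split
$forest = new RandomForest(30);
$forest->setFeatureSubsetRatio('sqrt'); // or 'log', or a float between 0.1 and 1.0
$forest->train($samples, $targets);
$label = $forest->predict(['rain', 60, 60, 'true']);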