diff --git a/src/Phpml/Classification/DecisionTree.php b/src/Phpml/Classification/DecisionTree.php
index 1a39cbe..1a04802 100644
--- a/src/Phpml/Classification/DecisionTree.php
+++ b/src/Phpml/Classification/DecisionTree.php
@@ -51,6 +51,11 @@ class DecisionTree implements Classifier
*/
public $actualDepth = 0;
+ /**
+ * Maximum number of feature columns getSelectedFeatures() picks for a split.
+ * 0 (the default) means every column is considered; changed via setNumFeatures().
+ *
+ * @var int
+ */
+ private $numUsableFeatures = 0;
+
/**
* @param int $maxDepth
*/
@@ -144,15 +149,15 @@ class DecisionTree implements Classifier
$samples = array_combine($records, $this->preprocess($samples));
$bestGiniVal = 1;
$bestSplit = null;
- for ($i=0; $i<$this->featureCount; $i++) {
+ $features = $this->getSelectedFeatures();
+ foreach ($features as $i) {
$colValues = [];
- $baseValue = null;
foreach ($samples as $index => $row) {
$colValues[$index] = $row[$i];
- if ($baseValue === null) {
- $baseValue = $row[$i];
- }
}
+ $counts = array_count_values($colValues);
+ arsort($counts);
+ $baseValue = key($counts);
$gini = $this->getGiniIndex($baseValue, $colValues, $targets);
if ($bestSplit == null || $bestGiniVal > $gini) {
$split = new DecisionTreeLeaf();
@@ -167,6 +172,27 @@ class DecisionTree implements Classifier
return $bestSplit;
}
+    /**
+     * Chooses the column indexes that are evaluated for the current split.
+     *
+     * With no limit configured (numUsableFeatures is 0) every column index is
+     * returned. Otherwise the indexes are shuffled, truncated to at most
+     * numUsableFeatures entries (never more than featureCount) and sorted
+     * ascending again.
+     *
+     * @return array list of zero-based column indexes
+     */
+    protected function getSelectedFeatures()
+    {
+        $candidates = range(0, $this->featureCount - 1);
+
+        if ($this->numUsableFeatures != 0) {
+            $limit = min($this->numUsableFeatures, $this->featureCount);
+            shuffle($candidates);
+            $candidates = array_slice($candidates, 0, $limit);
+            sort($candidates);
+        }
+
+        return $candidates;
+    }
+
/**
* @param string $baseValue
* @param array $colValues
@@ -248,6 +274,27 @@ class DecisionTree implements Classifier
return false;
}
+    /**
+     * Limits how many feature columns are examined when an internal node is split.
+     *
+     * Passing 0 keeps the default behaviour of examining every column. Any
+     * positive value is the maximum number of randomly selected columns
+     * considered for each split operation.
+     *
+     * @param int $numFeatures
+     * @return $this
+     * @throws \Exception when a negative count is given
+     */
+    public function setNumFeatures(int $numFeatures)
+    {
+        if ($numFeatures >= 0) {
+            $this->numUsableFeatures = $numFeatures;
+
+            return $this;
+        }
+
+        throw new \Exception("Selected column count should be greater or equal to zero");
+    }
+
/**
* @return string
*/
@@ -273,6 +320,7 @@ class DecisionTree implements Classifier
$node = $node->rightLeaf;
}
} while ($node);
- return $node->classValue;
+
+ return $node ? $node->classValue : $this->labels[0];
}
}
diff --git a/src/Phpml/Classification/DecisionTree/DecisionTreeLeaf.php b/src/Phpml/Classification/DecisionTree/DecisionTreeLeaf.php
index d428919..1993864 100644
--- a/src/Phpml/Classification/DecisionTree/DecisionTreeLeaf.php
+++ b/src/Phpml/Classification/DecisionTree/DecisionTreeLeaf.php
@@ -62,7 +62,7 @@ class DecisionTreeLeaf
public function evaluate($record)
{
$recordField = $record[$this->columnIndex];
- if (preg_match("/^([<>=]{1,2})\s*(.*)/", $this->value, $matches)) {
+ if (is_string($this->value) && preg_match("/^([<>=]{1,2})\s*(.*)/", $this->value, $matches)) {
$op = $matches[1];
$value= floatval($matches[2]);
$recordField = strval($recordField);
diff --git a/src/Phpml/Classification/Ensemble/Bagging.php b/src/Phpml/Classification/Ensemble/Bagging.php
new file mode 100644
index 0000000..817869e
--- /dev/null
+++ b/src/Phpml/Classification/Ensemble/Bagging.php
@@ -0,0 +1,198 @@
+ 20];
+
+ /**
+ * Base classifier instances; train() replaces this list with the result of
+ * initClassifiers() on every call.
+ *
+ * @var array
+ */
+ protected $classifiers;
+
+ /**
+ * Ratio that controls the size of the random subset each base classifier is trained on,
+ * relative to the number of stored samples. setSubsetRatio() accepts 0.1 to 1.0.
+ *
+ * @var float
+ */
+ protected $subsetRatio = 0.5;
+
+ /**
+ * All samples received so far; train() appends to this list on each call.
+ *
+ * @var array
+ */
+ private $samples = [];
+
+ /**
+ * Creates an ensemble classifier with the given number of base classifiers.
+ * The default number of base classifiers is 50.
+ * A larger number of base classifiers is meant to give better performance,
+ * at the cost of processing time.
+ *
+ * @param int $numClassifier
+ */
+ public function __construct($numClassifier = 50)
+ {
+ $this->numClassifier = $numClassifier;
+ }
+
+    /**
+     * Sets the ratio of samples used to build the 'bootstrap' subset, i.e. the
+     * random samples drawn from the original dataset with replacement
+     * (repeats allowed), on which each base classifier is trained.
+     *
+     * @param float $ratio value between 0.1 and 1.0 (inclusive)
+     * @return $this
+     * @throws \Exception when the ratio lies outside the accepted range
+     */
+    public function setSubsetRatio(float $ratio)
+    {
+        $outOfRange = $ratio < 0.1 || $ratio > 1.0;
+        if ($outOfRange) {
+            throw new \Exception("Subset ratio should be between 0.1 and 1.0");
+        }
+
+        $this->subsetRatio = $ratio;
+
+        return $this;
+    }
+
+    /**
+     * Selects the class used for the base classifiers together with the
+     * arguments handed to its constructor.
+     *
+     * Any class implementing Classifier may be given. The options must be
+     * listed in the order of the constructor's parameters; their names are
+     * neglected.
+     *
+     * @param string $classifier        class name of the base classifier
+     * @param array  $classifierOptions constructor arguments, in constructor order
+     * @return $this
+     */
+    public function setClassifer(string $classifier, array $classifierOptions = [])
+    {
+        $this->classifierOptions = $classifierOptions;
+        $this->classifier = $classifier;
+
+        return $this;
+    }
+
+    /**
+     * Adds the given samples and targets to the ones stored so far, then
+     * rebuilds every base classifier and trains each of them on its own
+     * random subset of the stored data.
+     *
+     * @param array $samples
+     * @param array $targets
+     */
+    public function train(array $samples, array $targets)
+    {
+        $this->samples = array_merge($this->samples, $samples);
+        $this->targets = array_merge($this->targets, $targets);
+        $this->featureCount = count($samples[0]);
+        $this->numSamples = count($this->samples);
+
+        // Fresh base classifiers on every call; each one gets its own random sub-sample.
+        $this->classifiers = $this->initClassifiers();
+        $position = 0;
+        foreach ($this->classifiers as $baseClassifier) {
+            list($subsetSamples, $subsetTargets) = $this->getRandomSubset($position);
+            $baseClassifier->train($subsetSamples, $subsetTargets);
+            $position++;
+        }
+    }
+
+    /**
+     * Draws a bootstrap sample: rows picked uniformly at random, with
+     * replacement, from all samples stored by train().
+     *
+     * The subset holds ceil(subsetRatio * numSamples) rows. An empty pair of
+     * lists is returned when no samples are stored.
+     *
+     * @param int $index position of the base classifier the subset is drawn for;
+     *                   not used by this implementation, kept so the signature
+     *                   stays compatible with existing callers and overrides
+     * @return array two-element list: [samples, targets]
+     */
+    protected function getRandomSubset($index)
+    {
+        $samples = [];
+        $targets = [];
+        if ($this->numSamples < 1) {
+            return [$samples, $targets];
+        }
+
+        $subsetSize = (int) ceil($this->subsetRatio * $this->numSamples);
+        $lastIndex = $this->numSamples - 1;
+        for ($i = 0; $i < $subsetSize; $i++) {
+            // Draw from the whole dataset. Starting the range at an offset derived
+            // from $index would confine some classifiers to the tail of the data.
+            $rand = rand(0, $lastIndex);
+            $samples[] = $this->samples[$rand];
+            $targets[] = $this->targets[$rand];
+        }
+
+        return [$samples, $targets];
+    }
+
+    /**
+     * Instantiates numClassifier base classifiers of the configured class and
+     * passes each one through initSingleClassifier().
+     *
+     * Constructor options are always passed positionally: their keys are
+     * dropped first, because ReflectionClass::newInstanceArgs() interprets
+     * string keys as named arguments on PHP 8.
+     *
+     * @return array list of base classifier instances
+     */
+    protected function initClassifiers()
+    {
+        // The reflected class and the argument list are the same for every instance.
+        $ref = new \ReflectionClass($this->classifier);
+        $args = array_values((array) $this->classifierOptions);
+
+        $classifiers = [];
+        for ($i = 0; $i < $this->numClassifier; $i++) {
+            if ($args) {
+                $obj = $ref->newInstanceArgs($args);
+            } else {
+                $obj = $ref->newInstance();
+            }
+            $classifiers[] = $this->initSingleClassifier($obj, $i);
+        }
+
+        return $classifiers;
+    }
+
+ /**
+ * Hook called by initClassifiers() for every newly created base classifier.
+ * This implementation returns the classifier unchanged.
+ *
+ * @param Classifier $classifier newly instantiated base classifier
+ * @param int $index zero-based position of the classifier in the ensemble
+ * @return Classifier
+ */
+ protected function initSingleClassifier($classifier, $index)
+ {
+ return $classifier;
+ }
+
+    /**
+     * Predicts a single sample by majority vote: every base classifier
+     * predicts the sample and the most frequent prediction is returned.
+     *
+     * @param array $sample
+     * @return mixed
+     */
+    protected function predictSample(array $sample)
+    {
+        $votes = array_map(
+            function ($classifier) use ($sample) {
+                /* @var $classifier Classifier */
+                return $classifier->predict($sample);
+            },
+            $this->classifiers
+        );
+
+        // Tally the votes and move the most frequent prediction to the front.
+        $tally = array_count_values($votes);
+        arsort($tally);
+        reset($tally);
+
+        return key($tally);
+    }
+}
diff --git a/src/Phpml/Classification/Ensemble/RandomForest.php b/src/Phpml/Classification/Ensemble/RandomForest.php
new file mode 100644
index 0000000..37df7ae
--- /dev/null
+++ b/src/Phpml/Classification/Ensemble/RandomForest.php
@@ -0,0 +1,89 @@
+setSubsetRatio(1.0);
+ }
+
+    /**
+     * Determines how many of the original columns (features) are used to
+     * construct the subsets on which the base classifiers are trained.
+     *
+     * Allowed values: 'sqrt', 'log' or a number between 0.1 and 1.0.
+     * Integers are accepted and treated as floats (so 1 means 1.0).
+     *
+     * If there are many features that diminish classification performance,
+     * small values should be preferred; with a low number of features the
+     * default value should give satisfactory performance.
+     *
+     * @param mixed $ratio string or float should be given
+     * @return $this
+     * @throws \Exception when the value is out of range, an unknown string,
+     *                    or neither a number nor a string
+     */
+    public function setFeatureSubsetRatio($ratio)
+    {
+        // Normalise integers so they are range-checked and later take the float path
+        // instead of silently falling through to the 'log' strategy.
+        if (is_int($ratio)) {
+            $ratio = (float) $ratio;
+        }
+
+        if (is_float($ratio) && ($ratio < 0.1 || $ratio > 1.0)) {
+            throw new \Exception("When a float given, feature subset ratio should be between 0.1 and 1.0");
+        }
+        if (is_string($ratio) && $ratio !== 'sqrt' && $ratio !== 'log') {
+            throw new \Exception("When a string given, feature subset ratio can only be 'sqrt' or 'log' ");
+        }
+        if (!is_float($ratio) && !is_string($ratio)) {
+            throw new \Exception("Feature subset ratio should be a float or one of 'sqrt' and 'log'");
+        }
+
+        $this->featureSubsetRatio = $ratio;
+
+        return $this;
+    }
+
+    /**
+     * Restricts the base classifier to DecisionTree, the only classifier the
+     * RandomForest algorithm can be used with, then delegates to the parent.
+     *
+     * @param string $classifier
+     * @param array $classifierOptions
+     * @return $this
+     * @throws \Exception when a class other than DecisionTree is given
+     */
+    public function setClassifer(string $classifier, array $classifierOptions = [])
+    {
+        if ($classifier === DecisionTree::class) {
+            return parent::setClassifer($classifier, $classifierOptions);
+        }
+
+        throw new \Exception("RandomForest can only use DecisionTree as base classifier");
+    }
+
+    /**
+     * Configures a freshly created tree with the number of features it may
+     * consider per split, derived from featureSubsetRatio:
+     * a float is a fraction of the feature count, 'sqrt' uses the square root
+     * and anything else uses the base-2 logarithm (each of the latter two plus one).
+     *
+     * @param DecisionTree $classifier
+     * @param int $index
+     * @return DecisionTree
+     */
+    protected function initSingleClassifier($classifier, $index)
+    {
+        if (is_float($this->featureSubsetRatio)) {
+            $featureCount = (int) ($this->featureSubsetRatio * $this->featureCount);
+        } elseif ($this->featureSubsetRatio === 'sqrt') {
+            // The strategy is stored in featureSubsetRatio, not in featureCount.
+            $featureCount = (int) sqrt($this->featureCount) + 1;
+        } else {
+            $featureCount = (int) log($this->featureCount, 2) + 1;
+        }
+
+        // setNumFeatures() treats 0 as "use every feature", so a small ratio that
+        // truncates to 0 must still select one feature; also cap at the real count.
+        $featureCount = max(1, min($featureCount, $this->featureCount));
+
+        return $classifier->setNumFeatures($featureCount);
+    }
+}
diff --git a/tests/Phpml/Classification/Ensemble/BaggingTest.php b/tests/Phpml/Classification/Ensemble/BaggingTest.php
new file mode 100644
index 0000000..e7dfcad
--- /dev/null
+++ b/tests/Phpml/Classification/Ensemble/BaggingTest.php
@@ -0,0 +1,127 @@
+getData($this->data);
+ $classifier = $this->getClassifier();
+ // Testing with default options
+ $classifier->train($data, $targets);
+ $this->assertEquals('Dont_play', $classifier->predict(['sunny', 78, 72, 'false']));
+ $this->assertEquals('Play', $classifier->predict(['overcast', 60, 60, 'false']));
+ $this->assertEquals('Dont_play', $classifier->predict(['rain', 60, 60, 'true']));
+
+ list($data, $targets) = $this->getData($this->extraData);
+ $classifier->train($data, $targets);
+ $this->assertEquals('Dont_play', $classifier->predict(['scorching', 95, 90, 'true']));
+ $this->assertEquals('Play', $classifier->predict(['overcast', 60, 60, 'false']));
+
+ return $classifier;
+ }
+
+    /**
+     * Saves a trained ensemble with ModelManager, restores it and checks that the
+     * restored model equals the original and predicts the same labels.
+     */
+    public function testSaveAndRestore()
+    {
+        list($data, $targets) = $this->getData($this->data);
+        $classifier = $this->getClassifier(5);
+        $classifier->train($data, $targets);
+
+        $testSamples = [['sunny', 78, 72, 'false'], ['overcast', 60, 60, 'false']];
+        $predicted = $classifier->predict($testSamples);
+
+        $filename = 'bagging-test-'.rand(100, 999).'-'.uniqid();
+        $filepath = tempnam(sys_get_temp_dir(), $filename);
+        try {
+            $modelManager = new ModelManager();
+            $modelManager->saveToFile($classifier, $filepath);
+
+            $restoredClassifier = $modelManager->restoreFromFile($filepath);
+            $this->assertEquals($classifier, $restoredClassifier);
+            $this->assertEquals($predicted, $restoredClassifier->predict($testSamples));
+        } finally {
+            // tempnam() creates the file; remove it even when an assertion fails.
+            if (is_string($filepath) && file_exists($filepath)) {
+                unlink($filepath);
+            }
+        }
+    }
+
+    /**
+     * For every available base classifier, trains the ensemble and a stand-alone
+     * instance of the base classifier on the same data and checks that both
+     * predict the same label for a few known samples.
+     */
+    public function testBaseClassifiers()
+    {
+        list($data, $targets) = $this->getData($this->data);
+        $baseClassifiers = $this->getAvailableBaseClassifiers();
+
+        foreach ($baseClassifiers as $base => $params) {
+            $classifier = $this->getClassifier();
+            $classifier->setClassifer($base, $params);
+            $classifier->train($data, $targets);
+
+            $baseClassifier = new $base(...array_values($params));
+            $baseClassifier->train($data, $targets);
+            $testData = [['sunny', 78, 72, 'false'], ['overcast', 60, 60, 'false'], ['rain', 60, 60, 'true']];
+            foreach ($testData as $test) {
+                $result = $classifier->predict($test);
+                // Compare against the stand-alone base classifier, not the ensemble itself.
+                $baseResult = $baseClassifier->predict($test);
+                $this->assertEquals($result, $baseResult);
+            }
+        }
+    }
+
+    /**
+     * Builds the ensemble under test: a Bagging classifier with the requested
+     * number of base classifiers, a subset ratio of 1.0 and DecisionTree as
+     * base classifier.
+     *
+     * @param int $numBaseClassifiers
+     * @return Bagging
+     */
+    protected function getClassifier($numBaseClassifiers = 50)
+    {
+        $ensemble = new Bagging($numBaseClassifiers);
+
+        // Both setters return the ensemble itself, so the calls can be chained.
+        return $ensemble
+            ->setSubsetRatio(1.0)
+            ->setClassifer(DecisionTree::class, ['depth' => 10]);
+    }
+
+ /**
+ * Lists the base classifiers exercised by testBaseClassifiers(): each key is a
+ * classifier class name, each value the options handed to setClassifer().
+ *
+ * @return array
+ */
+ protected function getAvailableBaseClassifiers()
+ {
+ return [
+ DecisionTree::class => ['depth' => 5],
+ NaiveBayes::class => []
+ ];
+ }
+
+    /**
+     * Turns a raw data set into [samples, targets]: the rows are repeated 20
+     * times and shuffled, then column 4 is split off as the target list.
+     *
+     * @param array $input rows whose element at index 4 is the target label
+     * @return array two-element list: [samples, targets]
+     */
+    private function getData($input)
+    {
+        // Repeat the input so that base classifiers working on a subset
+        // still see a data set of sufficient size.
+        $rows = [];
+        $round = 0;
+        while ($round < 20) {
+            $rows = array_merge($rows, $input);
+            $round++;
+        }
+        shuffle($rows);
+
+        $targets = array_column($rows, 4);
+        foreach ($rows as &$row) {
+            // Drop the target column from the sample.
+            array_splice($row, 4, 1);
+        }
+        unset($row);
+
+        return [$rows, $targets];
+    }
+}
diff --git a/tests/Phpml/Classification/Ensemble/RandomForestTest.php b/tests/Phpml/Classification/Ensemble/RandomForestTest.php
new file mode 100644
index 0000000..d32507c
--- /dev/null
+++ b/tests/Phpml/Classification/Ensemble/RandomForestTest.php
@@ -0,0 +1,38 @@
+setFeatureSubsetRatio('log');
+ return $classifier;
+ }
+
+ /**
+ * Lists the base classifiers usable in these tests, keyed by class name with the
+ * classifier options as value. Only DecisionTree is listed;
+ * RandomForest::setClassifer() throws for any other class.
+ *
+ * @return array
+ */
+ protected function getAvailableBaseClassifiers()
+ {
+ return [ DecisionTree::class => ['depth' => 5] ];
+ }
+
+    /**
+     * RandomForest must reject any base classifier other than DecisionTree.
+     */
+    public function testOtherBaseClassifier()
+    {
+        // The assertion lives outside the try block: a failing assertion throws an
+        // exception itself, which a catch (\Exception) around it would swallow.
+        $thrown = false;
+        try {
+            $classifier = new RandomForest();
+            $classifier->setClassifer(NaiveBayes::class);
+        } catch (\Exception $ex) {
+            $thrown = true;
+        }
+
+        $this->assertTrue($thrown, 'RandomForest accepted a base classifier other than DecisionTree');
+    }
+}