diff --git a/src/Phpml/Classification/DecisionTree.php b/src/Phpml/Classification/DecisionTree.php index 6a860eb..c73f870 100644 --- a/src/Phpml/Classification/DecisionTree.php +++ b/src/Phpml/Classification/DecisionTree.php @@ -56,6 +56,11 @@ class DecisionTree implements Classifier */ private $numUsableFeatures = 0; + /** + * @var array + */ + private $selectedFeatures; + /** * @var array */ @@ -126,33 +131,45 @@ class DecisionTree implements Classifier if ($this->actualDepth < $depth) { $this->actualDepth = $depth; } + + // Traverse all records to see if all records belong to the same class, + // otherwise group the records so that we can classify the leaf + // in case maximum depth is reached $leftRecords = []; $rightRecords= []; $remainingTargets = []; $prevRecord = null; $allSame = true; + foreach ($records as $recordNo) { + // Check if the previous record is the same with the current one $record = $this->samples[$recordNo]; if ($prevRecord && $prevRecord != $record) { $allSame = false; } $prevRecord = $record; + + // According to the split criteron, this record will + // belong to either left or the right side in the next split if ($split->evaluate($record)) { $leftRecords[] = $recordNo; } else { $rightRecords[]= $recordNo; } + + // Group remaining targets $target = $this->targets[$recordNo]; - if (! in_array($target, $remainingTargets)) { - $remainingTargets[] = $target; + if (! 
array_key_exists($target, $remainingTargets)) { + $remainingTargets[$target] = 1; + } else { + $remainingTargets[$target]++; } } if (count($remainingTargets) == 1 || $allSame || $depth >= $this->maxDepth) { $split->isTerminal = 1; - $classes = array_count_values($remainingTargets); - arsort($classes); - $split->classValue = key($classes); + arsort($remainingTargets); + $split->classValue = key($remainingTargets); } else { if ($leftRecords) { $split->leftLeaf = $this->getSplitLeaf($leftRecords, $depth + 1); @@ -200,15 +217,31 @@ class DecisionTree implements Classifier } /** + * Returns available features/columns to the tree for the decision making + * process.
+ * + * If a number is given with the setNumFeatures() method, then a random selection + * of features up to this number is returned.
+ * + * If some features are manually selected by use of setSelectedFeatures(), + * then only these features are returned.
+ * + * If any of above methods were not called beforehand, then all features + * are returned by default. + * * @return array */ protected function getSelectedFeatures() { $allFeatures = range(0, $this->featureCount - 1); - if ($this->numUsableFeatures == 0) { + if ($this->numUsableFeatures == 0 && ! $this->selectedFeatures) { return $allFeatures; } + if ($this->selectedFeatures) { + return $this->selectedFeatures; + } + $numFeatures = $this->numUsableFeatures; if ($numFeatures > $this->featureCount) { $numFeatures = $this->featureCount; @@ -323,6 +356,16 @@ class DecisionTree implements Classifier return $this; } + /** + * Used to set predefined features to consider while deciding which column to use for a split, + * + * @param array $features + */ + protected function setSelectedFeatures(array $selectedFeatures) + { + $this->selectedFeatures = $selectedFeatures; + } + /** * A string array to represent columns. Useful when HTML output or * column importances are desired to be inspected. diff --git a/src/Phpml/Classification/Linear/Adaline.php b/src/Phpml/Classification/Linear/Adaline.php new file mode 100644 index 0000000..94283d9 --- /dev/null +++ b/src/Phpml/Classification/Linear/Adaline.php @@ -0,0 +1,148 @@ + + * + * Learning rate should be a float value between 0.0(exclusive) and 1.0 (inclusive)
+ * Maximum number of iterations can be an integer value greater than 0
+ * If normalizeInputs is set to true, then every input given to the algorithm will be standardized + * by use of standard deviation and mean calculation + * + * @param float $learningRate + * @param int $maxIterations + */ + public function __construct(float $learningRate = 0.001, int $maxIterations = 1000, + bool $normalizeInputs = true, int $trainingType = self::BATCH_TRAINING) + { + if ($normalizeInputs) { + $this->normalizer = new Normalizer(Normalizer::NORM_STD); + } + + if (! in_array($trainingType, [self::BATCH_TRAINING, self::ONLINE_TRAINING])) { + throw new \Exception("Adaline can only be trained with batch and online/stochastic gradient descent algorithm"); + } + $this->trainingType = $trainingType; + + parent::__construct($learningRate, $maxIterations); + } + + /** + * @param array $samples + * @param array $targets + */ + public function train(array $samples, array $targets) + { + if ($this->normalizer) { + $this->normalizer->transform($samples); + } + + parent::train($samples, $targets); + } + + /** + * Adapts the weights with respect to given samples and targets + * by use of gradient descent learning rule + */ + protected function runTraining() + { + // If online training is chosen, then the parent runTraining method + // will be executed with the 'output' method as the error function + if ($this->trainingType == self::ONLINE_TRAINING) { + return parent::runTraining(); + } + + // Batch learning is executed: + $currIter = 0; + while ($this->maxIterations > $currIter++) { + $outputs = array_map([$this, 'output'], $this->samples); + $updates = array_map([$this, 'gradient'], $this->targets, $outputs); + $sum = array_sum($updates); + + // Updates all weights at once + for ($i=0; $i <= $this->featureCount; $i++) { + if ($i == 0) { + $this->weights[0] += $this->learningRate * $sum; + } else { + $col = array_column($this->samples, $i - 1); + $error = 0; + foreach ($col as $index => $val) { + $error += $val * $updates[$index]; + } + + $this->weights[$i] += 
$this->learningRate * $error; + } + } + } + } + + /** + * Returns the direction of gradient given the desired and actual outputs + * + * @param int $desired + * @param int $output + * @return int + */ + protected function gradient($desired, $output) + { + return $desired - $output; + } + + /** + * @param array $sample + * @return mixed + */ + public function predictSample(array $sample) + { + if ($this->normalizer) { + $samples = [$sample]; + $this->normalizer->transform($samples); + $sample = $samples[0]; + } + + return parent::predictSample($sample); + } +} diff --git a/src/Phpml/Classification/Linear/DecisionStump.php b/src/Phpml/Classification/Linear/DecisionStump.php new file mode 100644 index 0000000..18d4449 --- /dev/null +++ b/src/Phpml/Classification/Linear/DecisionStump.php @@ -0,0 +1,56 @@ + + * + * If columnIndex is given, then the stump tries to produce a decision node + * on this column, otherwise in cases given the value of -1, the stump itself + * decides which column to take for the decision (Default DecisionTree behaviour) + * + * @param int $columnIndex + */ + public function __construct(int $columnIndex = -1) + { + $this->columnIndex = $columnIndex; + + parent::__construct(1); + } + + /** + * @param array $samples + * @param array $targets + */ + public function train(array $samples, array $targets) + { + // Check if a column index was given + if ($this->columnIndex >= 0 && $this->columnIndex > count($samples[0]) - 1) { + $this->columnIndex = -1; + } + + if ($this->columnIndex >= 0) { + $this->setSelectedFeatures([$this->columnIndex]); + } + + parent::train($samples, $targets); + } +} diff --git a/src/Phpml/Classification/Linear/Perceptron.php b/src/Phpml/Classification/Linear/Perceptron.php new file mode 100644 index 0000000..963638e --- /dev/null +++ b/src/Phpml/Classification/Linear/Perceptron.php @@ -0,0 +1,174 @@ + + * + * Learning rate should be a float value between 0.0(exclusive) and 1.0(inclusive)
+ * Maximum number of iterations can be an integer value greater than 0 + * @param float $learningRate + * @param int $maxIterations + */ + public function __construct(float $learningRate = 0.001, int $maxIterations = 1000) + { + if ($learningRate <= 0.0 || $learningRate > 1.0) { + throw new \Exception("Learning rate should be a float value between 0.0(exclusive) and 1.0(inclusive)"); + } + + if ($maxIterations <= 0) { + throw new \Exception("Maximum number of iterations should be an integer greater than 0"); + } + + $this->learningRate = $learningRate; + $this->maxIterations = $maxIterations; + } + + /** + * @param array $samples + * @param array $targets + */ + public function train(array $samples, array $targets) + { + $this->labels = array_keys(array_count_values($targets)); + if (count($this->labels) > 2) { + throw new \Exception("Perceptron is for only binary (two-class) classification"); + } + + // Set all target values to either -1 or 1 + $this->labels = [1 => $this->labels[0], -1 => $this->labels[1]]; + foreach ($targets as $target) { + $this->targets[] = $target == $this->labels[1] ? 
1 : -1; + } + + // Set samples and feature count vars + $this->samples = array_merge($this->samples, $samples); + $this->featureCount = count($this->samples[0]); + + // Init weights with random values + $this->weights = array_fill(0, $this->featureCount + 1, 0); + foreach ($this->weights as &$weight) { + $weight = rand() / (float) getrandmax(); + } + // Do training + $this->runTraining(); + } + + /** + * Adapts the weights with respect to given samples and targets + * by use of perceptron learning rule + */ + protected function runTraining() + { + $currIter = 0; + while ($this->maxIterations > $currIter++) { + foreach ($this->samples as $index => $sample) { + $target = $this->targets[$index]; + $prediction = $this->{static::$errorFunction}($sample); + $update = $target - $prediction; + // Update bias + $this->weights[0] += $update * $this->learningRate; // Bias + // Update other weights + for ($i=1; $i <= $this->featureCount; $i++) { + $this->weights[$i] += $update * $sample[$i - 1] * $this->learningRate; + } + } + } + } + + /** + * Calculates net output of the network as a float value for the given input + * + * @param array $sample + * @return int + */ + protected function output(array $sample) + { + $sum = 0; + foreach ($this->weights as $index => $w) { + if ($index == 0) { + $sum += $w; + } else { + $sum += $w * $sample[$index - 1]; + } + } + + return $sum; + } + + /** + * Returns the class value (either -1 or 1) for the given input + * + * @param array $sample + * @return int + */ + protected function outputClass(array $sample) + { + return $this->output($sample) > 0 ? 
1 : -1; + } + + /** + * @param array $sample + * @return mixed + */ + protected function predictSample(array $sample) + { + $predictedClass = $this->outputClass($sample); + + return $this->labels[ $predictedClass ]; + } +} diff --git a/src/Phpml/Preprocessing/Normalizer.php b/src/Phpml/Preprocessing/Normalizer.php index 5cff6e8..42a8f1c 100644 --- a/src/Phpml/Preprocessing/Normalizer.php +++ b/src/Phpml/Preprocessing/Normalizer.php @@ -5,17 +5,35 @@ declare(strict_types=1); namespace Phpml\Preprocessing; use Phpml\Exception\NormalizerException; +use Phpml\Math\Statistic\StandardDeviation; +use Phpml\Math\Statistic\Mean; class Normalizer implements Preprocessor { const NORM_L1 = 1; const NORM_L2 = 2; + const NORM_STD= 3; /** * @var int */ private $norm; + /** + * @var bool + */ + private $fitted = false; + + /** + * @var array + */ + private $std; + + /** + * @var array + */ + private $mean; + /** * @param int $norm * @@ -23,7 +41,7 @@ class Normalizer implements Preprocessor */ public function __construct(int $norm = self::NORM_L2) { - if (!in_array($norm, [self::NORM_L1, self::NORM_L2])) { + if (!in_array($norm, [self::NORM_L1, self::NORM_L2, self::NORM_STD])) { throw NormalizerException::unknownNorm(); } @@ -35,7 +53,20 @@ class Normalizer implements Preprocessor */ public function fit(array $samples) { - // intentionally not implemented + if ($this->fitted) { + return; + } + + if ($this->norm == self::NORM_STD) { + $features = range(0, count($samples[0]) - 1); + foreach ($features as $i) { + $values = array_column($samples, $i); + $this->std[$i] = StandardDeviation::population($values); + $this->mean[$i] = Mean::arithmetic($values); + } + } + + $this->fitted = true; } /** @@ -43,7 +74,15 @@ class Normalizer implements Preprocessor */ public function transform(array &$samples) { - $method = sprintf('normalizeL%s', $this->norm); + $methods = [ + self::NORM_L1 => 'normalizeL1', + self::NORM_L2 => 'normalizeL2', + self::NORM_STD=> 'normalizeSTD' + ]; + $method = 
$methods[$this->norm]; + + $this->fit($samples); + foreach ($samples as &$sample) { $this->$method($sample); } @@ -88,4 +127,14 @@ class Normalizer implements Preprocessor } } } + + /** + * @param array $sample + */ + private function normalizeSTD(array &$sample) + { + foreach ($sample as $i => $val) { + $sample[$i] = ($sample[$i] - $this->mean[$i]) / $this->std[$i]; + } + } } diff --git a/tests/Phpml/Classification/Linear/AdalineTest.php b/tests/Phpml/Classification/Linear/AdalineTest.php new file mode 100644 index 0000000..7ea63ab --- /dev/null +++ b/tests/Phpml/Classification/Linear/AdalineTest.php @@ -0,0 +1,55 @@ +train($samples, $targets); + $this->assertEquals(0, $classifier->predict([0.1, 0.2])); + $this->assertEquals(0, $classifier->predict([0.1, 0.99])); + $this->assertEquals(1, $classifier->predict([1.1, 0.8])); + + // OR problem + $samples = [[0, 0], [1, 0], [0, 1], [1, 1]]; + $targets = [0, 1, 1, 1]; + $classifier = new Adaline(); + $classifier->train($samples, $targets); + $this->assertEquals(0, $classifier->predict([0.1, 0.2])); + $this->assertEquals(1, $classifier->predict([0.1, 0.99])); + $this->assertEquals(1, $classifier->predict([1.1, 0.8])); + + return $classifier; + } + + public function testSaveAndRestore() + { + // Instantinate new Percetron trained for OR problem + $samples = [[0, 0], [1, 0], [0, 1], [1, 1]]; + $targets = [0, 1, 1, 1]; + $classifier = new Adaline(); + $classifier->train($samples, $targets); + $testSamples = [[0, 1], [1, 1], [0.2, 0.1]]; + $predicted = $classifier->predict($testSamples); + + $filename = 'adaline-test-'.rand(100, 999).'-'.uniqid(); + $filepath = tempnam(sys_get_temp_dir(), $filename); + $modelManager = new ModelManager(); + $modelManager->saveToFile($classifier, $filepath); + + $restoredClassifier = $modelManager->restoreFromFile($filepath); + $this->assertEquals($classifier, $restoredClassifier); + $this->assertEquals($predicted, $restoredClassifier->predict($testSamples)); + } +} diff --git 
a/tests/Phpml/Classification/Linear/DecisionStumpTest.php b/tests/Phpml/Classification/Linear/DecisionStumpTest.php new file mode 100644 index 0000000..f83e095 --- /dev/null +++ b/tests/Phpml/Classification/Linear/DecisionStumpTest.php @@ -0,0 +1,59 @@ +train($samples, $targets); + $this->assertEquals(0, $classifier->predict([0.1, 0.2])); + $this->assertEquals(0, $classifier->predict([1.1, 0.2])); + $this->assertEquals(1, $classifier->predict([0.1, 0.99])); + $this->assertEquals(1, $classifier->predict([1.1, 0.8])); + + // Then: vertical test + $samples = [[0, 0], [1, 0], [0, 1], [1, 1]]; + $targets = [0, 1, 0, 1]; + $classifier = new DecisionStump(); + $classifier->train($samples, $targets); + $this->assertEquals(0, $classifier->predict([0.1, 0.2])); + $this->assertEquals(0, $classifier->predict([0.1, 1.1])); + $this->assertEquals(1, $classifier->predict([1.0, 0.99])); + $this->assertEquals(1, $classifier->predict([1.1, 0.1])); + + return $classifier; + } + + public function testSaveAndRestore() + { + // Instantinate new Percetron trained for OR problem + $samples = [[0, 0], [1, 0], [0, 1], [1, 1]]; + $targets = [0, 1, 1, 1]; + $classifier = new DecisionStump(); + $classifier->train($samples, $targets); + $testSamples = [[0, 1], [1, 1], [0.2, 0.1]]; + $predicted = $classifier->predict($testSamples); + + $filename = 'dstump-test-'.rand(100, 999).'-'.uniqid(); + $filepath = tempnam(sys_get_temp_dir(), $filename); + $modelManager = new ModelManager(); + $modelManager->saveToFile($classifier, $filepath); + + $restoredClassifier = $modelManager->restoreFromFile($filepath); + $this->assertEquals($classifier, $restoredClassifier); + $this->assertEquals($predicted, $restoredClassifier->predict($testSamples)); + } +} diff --git a/tests/Phpml/Classification/Linear/PerceptronTest.php b/tests/Phpml/Classification/Linear/PerceptronTest.php new file mode 100644 index 0000000..bf1b384 --- /dev/null +++ b/tests/Phpml/Classification/Linear/PerceptronTest.php @@ -0,0 +1,55 @@ 
+train($samples, $targets); + $this->assertEquals(0, $classifier->predict([0.1, 0.2])); + $this->assertEquals(0, $classifier->predict([0.1, 0.99])); + $this->assertEquals(1, $classifier->predict([1.1, 0.8])); + + // OR problem + $samples = [[0, 0], [0.1, 0.2], [1, 0], [0, 1], [1, 1]]; + $targets = [0, 0, 1, 1, 1]; + $classifier = new Perceptron(0.001, 5000); + $classifier->train($samples, $targets); + $this->assertEquals(0, $classifier->predict([0, 0])); + $this->assertEquals(1, $classifier->predict([0.1, 0.99])); + $this->assertEquals(1, $classifier->predict([1.1, 0.8])); + + return $classifier; + } + + public function testSaveAndRestore() + { + // Instantinate new Percetron trained for OR problem + $samples = [[0, 0], [1, 0], [0, 1], [1, 1]]; + $targets = [0, 1, 1, 1]; + $classifier = new Perceptron(); + $classifier->train($samples, $targets); + $testSamples = [[0, 1], [1, 1], [0.2, 0.1]]; + $predicted = $classifier->predict($testSamples); + + $filename = 'perceptron-test-'.rand(100, 999).'-'.uniqid(); + $filepath = tempnam(sys_get_temp_dir(), $filename); + $modelManager = new ModelManager(); + $modelManager->saveToFile($classifier, $filepath); + + $restoredClassifier = $modelManager->restoreFromFile($filepath); + $this->assertEquals($classifier, $restoredClassifier); + $this->assertEquals($predicted, $restoredClassifier->predict($testSamples)); + } +} diff --git a/tests/Phpml/Preprocessing/NormalizerTest.php b/tests/Phpml/Preprocessing/NormalizerTest.php index 99ebf4e..07d121c 100644 --- a/tests/Phpml/Preprocessing/NormalizerTest.php +++ b/tests/Phpml/Preprocessing/NormalizerTest.php @@ -100,4 +100,32 @@ class NormalizerTest extends TestCase $this->assertEquals($normalized, $samples, '', $delta = 0.01); } + + public function testStandardNorm() + { + // Generate 10 random vectors of length 3 + $samples = []; + srand(time()); + for ($i=0; $i<10; $i++) { + $sample = array_fill(0, 3, 0); + for ($k=0; $k<3; $k++) { + $sample[$k] = rand(1, 100); + } + $samples[] = 
$sample; + } + + // Use standard normalization + $normalizer = new Normalizer(Normalizer::NORM_STD); + $normalizer->transform($samples); + + // Values in the vector should be some value between -3 and +3 + $this->assertCount(10, $samples); + foreach ($samples as $sample) { + $errors = array_filter($sample, + function ($element) { + return $element < -3 || $element > 3; + }); + $this->assertCount(0, $errors); + } + } }