Linear classifiers: Perceptron, Adaline, DecisionStump (#50)

* Linear classifiers

* Code formatting to PSR-2

* Added basic test cases for linear classifiers
This commit is contained in:
Mustafa Karabulut 2017-02-17 01:23:55 +03:00 committed by Arkadiusz Kondas
parent f0a7984f39
commit cf222bcce4
9 changed files with 676 additions and 9 deletions

View File

@ -56,6 +56,11 @@ class DecisionTree implements Classifier
*/ */
private $numUsableFeatures = 0; private $numUsableFeatures = 0;
/**
* @var array
*/
private $selectedFeatures;
/** /**
* @var array * @var array
*/ */
@ -126,33 +131,45 @@ class DecisionTree implements Classifier
if ($this->actualDepth < $depth) { if ($this->actualDepth < $depth) {
$this->actualDepth = $depth; $this->actualDepth = $depth;
} }
// Traverse all records to see if all records belong to the same class,
// otherwise group the records so that we can classify the leaf
// in case maximum depth is reached
$leftRecords = []; $leftRecords = [];
$rightRecords= []; $rightRecords= [];
$remainingTargets = []; $remainingTargets = [];
$prevRecord = null; $prevRecord = null;
$allSame = true; $allSame = true;
foreach ($records as $recordNo) { foreach ($records as $recordNo) {
// Check if the previous record is the same with the current one
$record = $this->samples[$recordNo]; $record = $this->samples[$recordNo];
if ($prevRecord && $prevRecord != $record) { if ($prevRecord && $prevRecord != $record) {
$allSame = false; $allSame = false;
} }
$prevRecord = $record; $prevRecord = $record;
// According to the split criterion, this record will
// belong to either the left or the right side in the next split
if ($split->evaluate($record)) { if ($split->evaluate($record)) {
$leftRecords[] = $recordNo; $leftRecords[] = $recordNo;
} else { } else {
$rightRecords[]= $recordNo; $rightRecords[]= $recordNo;
} }
// Group remaining targets
$target = $this->targets[$recordNo]; $target = $this->targets[$recordNo];
if (! in_array($target, $remainingTargets)) { if (! array_key_exists($target, $remainingTargets)) {
$remainingTargets[] = $target; $remainingTargets[$target] = 1;
} else {
$remainingTargets[$target]++;
} }
} }
if (count($remainingTargets) == 1 || $allSame || $depth >= $this->maxDepth) { if (count($remainingTargets) == 1 || $allSame || $depth >= $this->maxDepth) {
$split->isTerminal = 1; $split->isTerminal = 1;
$classes = array_count_values($remainingTargets); arsort($remainingTargets);
arsort($classes); $split->classValue = key($remainingTargets);
$split->classValue = key($classes);
} else { } else {
if ($leftRecords) { if ($leftRecords) {
$split->leftLeaf = $this->getSplitLeaf($leftRecords, $depth + 1); $split->leftLeaf = $this->getSplitLeaf($leftRecords, $depth + 1);
@ -200,15 +217,31 @@ class DecisionTree implements Classifier
} }
/** /**
* Returns available features/columns to the tree for the decision making
* process. <br>
*
* If a number is given with setNumFeatures() method, then a random selection
* of features up to this number is returned. <br>
*
* If some features are manually selected by use of setSelectedFeatures(),
* then only these features are returned <br>
*
* If any of above methods were not called beforehand, then all features
* are returned by default.
*
* @return array * @return array
*/ */
protected function getSelectedFeatures() protected function getSelectedFeatures()
{ {
$allFeatures = range(0, $this->featureCount - 1); $allFeatures = range(0, $this->featureCount - 1);
if ($this->numUsableFeatures == 0) { if ($this->numUsableFeatures == 0 && ! $this->selectedFeatures) {
return $allFeatures; return $allFeatures;
} }
if ($this->selectedFeatures) {
return $this->selectedFeatures;
}
$numFeatures = $this->numUsableFeatures; $numFeatures = $this->numUsableFeatures;
if ($numFeatures > $this->featureCount) { if ($numFeatures > $this->featureCount) {
$numFeatures = $this->featureCount; $numFeatures = $this->featureCount;
@ -323,6 +356,16 @@ class DecisionTree implements Classifier
return $this; return $this;
} }
/**
* Stores a predefined list of features (columns) in $this->selectedFeatures,
* to be considered while deciding which column to use for a split.
*
* @param array $selectedFeatures Features to restrict the split search to
*                                (presumably column indices - confirm against callers)
*/
protected function setSelectedFeatures(array $selectedFeatures)
{
$this->selectedFeatures = $selectedFeatures;
}
/** /**
* A string array to represent columns. Useful when HTML output or * A string array to represent columns. Useful when HTML output or
* column importances are desired to be inspected. * column importances are desired to be inspected.

View File

@ -0,0 +1,148 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification\Linear;
use Phpml\Helper\Predictable;
use Phpml\Helper\Trainable;
use Phpml\Classification\Classifier;
use Phpml\Classification\Linear\Perceptron;
use Phpml\Preprocessing\Normalizer;
class Adaline extends Perceptron
{
    /**
     * Batch training is the default Adaline training algorithm
     */
    const BATCH_TRAINING = 1;

    /**
     * Online training: Stochastic gradient descent learning
     */
    const ONLINE_TRAINING = 2;

    /**
     * Name of the method whose result is used to calculate the network error
     * for each instance. Adaline learns from the raw net output ('output')
     * instead of the thresholded class value used by the Perceptron.
     *
     * @var string
     */
    protected static $errorFunction = 'output';

    /**
     * Training type: either self::BATCH_TRAINING or self::ONLINE_TRAINING
     *
     * @var int
     */
    protected $trainingType;

    /**
     * Standardizes the inputs; left unset when input normalization is disabled
     *
     * @var Normalizer|null
     */
    private $normalizer;

    /**
     * Initialize an Adaline (ADAptive LInear NEuron) classifier with given learning rate and maximum
     * number of iterations used while training the classifier <br>
     *
     * Learning rate should be a float value between 0.0 (exclusive) and 1.0 (inclusive) <br>
     * Maximum number of iterations can be an integer value greater than 0 <br>
     * If normalizeInputs is set to true, then every input given to the algorithm will be standardized
     * by use of standard deviation and mean calculation
     *
     * @param float $learningRate
     * @param int   $maxIterations
     * @param bool  $normalizeInputs
     * @param int   $trainingType    self::BATCH_TRAINING or self::ONLINE_TRAINING
     *
     * @throws \Exception if the training type is unknown, or if the parent
     *                    constructor rejects the learning rate / iteration count
     */
    public function __construct(float $learningRate = 0.001, int $maxIterations = 1000,
        bool $normalizeInputs = true, int $trainingType = self::BATCH_TRAINING)
    {
        if ($normalizeInputs) {
            $this->normalizer = new Normalizer(Normalizer::NORM_STD);
        }

        if (! in_array($trainingType, [self::BATCH_TRAINING, self::ONLINE_TRAINING], true)) {
            throw new \Exception("Adaline can only be trained with batch and online/stochastic gradient descent algorithm");
        }
        $this->trainingType = $trainingType;

        parent::__construct($learningRate, $maxIterations);
    }

    /**
     * Standardizes the samples (when normalization is enabled) and then
     * delegates to the Perceptron training routine.
     *
     * @param array $samples
     * @param array $targets
     */
    public function train(array $samples, array $targets)
    {
        if ($this->normalizer) {
            // transform() works in place on $samples
            $this->normalizer->transform($samples);
        }

        parent::train($samples, $targets);
    }

    /**
     * Adapts the weights with respect to given samples and targets
     * by use of gradient descent learning rule
     */
    protected function runTraining()
    {
        // If online training is chosen, then the parent runTraining method
        // will be executed with the 'output' method as the error function
        if ($this->trainingType == self::ONLINE_TRAINING) {
            return parent::runTraining();
        }

        // Batch learning is executed.
        // The samples do not change between iterations, so each feature
        // column is extracted once up front rather than on every pass.
        $columns = [];
        for ($i = 1; $i <= $this->featureCount; $i++) {
            $columns[$i] = array_column($this->samples, $i - 1);
        }

        $currIter = 0;
        while ($this->maxIterations > $currIter++) {
            // Outputs for all samples are computed before any weight changes
            $outputs = array_map([$this, 'output'], $this->samples);
            $updates = array_map([$this, 'gradient'], $this->targets, $outputs);

            // Updates all weights at once: bias first, then one weight per feature
            $this->weights[0] += $this->learningRate * array_sum($updates);
            foreach ($columns as $i => $col) {
                $error = 0;
                foreach ($col as $index => $val) {
                    $error += $val * $updates[$index];
                }

                $this->weights[$i] += $this->learningRate * $error;
            }
        }
    }

    /**
     * Returns the direction of gradient given the desired and actual outputs
     *
     * @param int       $desired Target value of the sample
     * @param float|int $output  Net output produced for the sample
     * @return float|int Difference between desired and actual output
     */
    protected function gradient($desired, $output)
    {
        return $desired - $output;
    }

    /**
     * Applies the same standardization used in training (when enabled)
     * before asking the parent class for the predicted label.
     *
     * @param array $sample
     * @return mixed
     */
    public function predictSample(array $sample)
    {
        if ($this->normalizer) {
            // transform() expects a list of samples and modifies it in place
            $samples = [$sample];
            $this->normalizer->transform($samples);
            $sample = $samples[0];
        }

        return parent::predictSample($sample);
    }
}

View File

@ -0,0 +1,56 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification\Linear;
use Phpml\Helper\Predictable;
use Phpml\Helper\Trainable;
use Phpml\Classification\Classifier;
use Phpml\Classification\DecisionTree;
class DecisionStump extends DecisionTree
{
    use Trainable, Predictable;

    /**
     * Column the stump is asked to split on; a negative value leaves the
     * choice of column to the parent DecisionTree.
     *
     * @var int
     */
    protected $columnIndex;

    /**
     * A DecisionStump classifier is a one-level deep DecisionTree. It is generally
     * used with ensemble algorithms as in the weak classifier role. <br>
     *
     * When a non-negative columnIndex is supplied, the stump tries to build its
     * decision node on that column. With the default of -1 the stump picks the
     * column itself (default DecisionTree behaviour).
     *
     * @param int $columnIndex
     */
    public function __construct(int $columnIndex = -1)
    {
        $this->columnIndex = $columnIndex;

        // 1 is handed to the DecisionTree constructor (presumably the maximum depth)
        parent::__construct(1);
    }

    /**
     * Restricts the tree to the requested column when it is valid for the
     * given samples, then trains the underlying DecisionTree.
     *
     * @param array $samples
     * @param array $targets
     */
    public function train(array $samples, array $targets)
    {
        if ($this->columnIndex >= 0) {
            if ($this->columnIndex >= count($samples[0])) {
                // Requested column does not exist in the samples:
                // fall back to automatic column selection
                $this->columnIndex = -1;
            } else {
                $this->setSelectedFeatures([$this->columnIndex]);
            }
        }

        parent::train($samples, $targets);
    }
}

View File

@ -0,0 +1,174 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification\Linear;
use Phpml\Helper\Predictable;
use Phpml\Helper\Trainable;
use Phpml\Classification\Classifier;
class Perceptron implements Classifier
{
    use Predictable;

    /**
     * Name of the method whose result will be used to calculate the network
     * error for each instance
     *
     * @var string
     */
    protected static $errorFunction = 'outputClass';

    /**
     * @var array
     */
    protected $samples = [];

    /**
     * Targets converted to 1 / -1
     *
     * @var array
     */
    protected $targets = [];

    /**
     * Maps the internal class values (1 and -1) back to the original labels
     *
     * @var array
     */
    protected $labels = [];

    /**
     * @var int
     */
    protected $featureCount = 0;

    /**
     * Index 0 holds the bias, index i (i >= 1) the weight of feature i - 1
     *
     * @var array
     */
    protected $weights;

    /**
     * @var float
     */
    protected $learningRate;

    /**
     * @var int
     */
    protected $maxIterations;

    /**
     * Initialize a perceptron classifier with given learning rate and maximum
     * number of iterations used while training the perceptron <br>
     *
     * Learning rate should be a float value between 0.0 (exclusive) and 1.0 (inclusive) <br>
     * Maximum number of iterations can be an integer value greater than 0
     *
     * @param float $learningRate
     * @param int   $maxIterations
     *
     * @throws \Exception if either argument is outside its valid range
     */
    public function __construct(float $learningRate = 0.001, int $maxIterations = 1000)
    {
        if ($learningRate <= 0.0 || $learningRate > 1.0) {
            throw new \Exception("Learning rate should be a float value between 0.0(exclusive) and 1.0(inclusive)");
        }

        if ($maxIterations <= 0) {
            throw new \Exception("Maximum number of iterations should be an integer greater than 0");
        }

        $this->learningRate = $learningRate;
        $this->maxIterations = $maxIterations;
    }

    /**
     * Trains the perceptron on the given samples and their targets.
     *
     * @param array $samples
     * @param array $targets
     *
     * @throws \Exception if the targets do not contain exactly two distinct classes
     */
    public function train(array $samples, array $targets)
    {
        $this->labels = array_keys(array_count_values($targets));
        // Exactly two classes are required: with fewer, the second label
        // below would not exist and the -1 class would map to nothing.
        if (count($this->labels) != 2) {
            throw new \Exception("Perceptron is for only binary (two-class) classification");
        }

        // Set all target values to either -1 or 1
        $this->labels = [1 => $this->labels[0], -1 => $this->labels[1]];
        foreach ($targets as $target) {
            $this->targets[] = $target == $this->labels[1] ? 1 : -1;
        }

        // Set samples and feature count vars
        $this->samples = array_merge($this->samples, $samples);
        $this->featureCount = count($this->samples[0]);

        // Init weights with random values in [0, 1]; one extra slot for the bias
        $this->weights = array_fill(0, $this->featureCount + 1, 0);
        foreach ($this->weights as &$weight) {
            $weight = rand() / (float) getrandmax();
        }
        // Break the reference so later writes cannot alias the last weight
        unset($weight);

        // Do training
        $this->runTraining();
    }

    /**
     * Adapts the weights with respect to given samples and targets
     * by use of perceptron learning rule
     */
    protected function runTraining()
    {
        $currIter = 0;
        while ($this->maxIterations > $currIter++) {
            foreach ($this->samples as $index => $sample) {
                $target = $this->targets[$index];
                // Late static binding lets subclasses choose the error function
                $prediction = $this->{static::$errorFunction}($sample);
                $update = $target - $prediction;

                // Update bias
                $this->weights[0] += $update * $this->learningRate;

                // Update other weights
                for ($i = 1; $i <= $this->featureCount; $i++) {
                    $this->weights[$i] += $update * $sample[$i - 1] * $this->learningRate;
                }
            }
        }
    }

    /**
     * Calculates net output of the network (bias plus weighted sum of the
     * inputs) for the given input
     *
     * @param array $sample
     * @return float|int
     */
    protected function output(array $sample)
    {
        $sum = 0;
        foreach ($this->weights as $index => $w) {
            if ($index == 0) {
                // Bias term has no matching input
                $sum += $w;
            } else {
                $sum += $w * $sample[$index - 1];
            }
        }

        return $sum;
    }

    /**
     * Returns the class value (either -1 or 1) for the given input
     *
     * @param array $sample
     * @return int
     */
    protected function outputClass(array $sample)
    {
        return $this->output($sample) > 0 ? 1 : -1;
    }

    /**
     * Returns the original label that corresponds to the predicted class value
     *
     * @param array $sample
     * @return mixed
     */
    protected function predictSample(array $sample)
    {
        $predictedClass = $this->outputClass($sample);

        return $this->labels[$predictedClass];
    }
}

View File

@ -5,17 +5,35 @@ declare(strict_types=1);
namespace Phpml\Preprocessing; namespace Phpml\Preprocessing;
use Phpml\Exception\NormalizerException; use Phpml\Exception\NormalizerException;
use Phpml\Math\Statistic\StandardDeviation;
use Phpml\Math\Statistic\Mean;
class Normalizer implements Preprocessor class Normalizer implements Preprocessor
{ {
const NORM_L1 = 1; const NORM_L1 = 1;
const NORM_L2 = 2; const NORM_L2 = 2;
const NORM_STD= 3;
/** /**
* @var int * @var int
*/ */
private $norm; private $norm;
/**
* @var bool
*/
private $fitted = false;
/**
* @var array
*/
private $std;
/**
* @var array
*/
private $mean;
/** /**
* @param int $norm * @param int $norm
* *
@ -23,7 +41,7 @@ class Normalizer implements Preprocessor
*/ */
public function __construct(int $norm = self::NORM_L2) public function __construct(int $norm = self::NORM_L2)
{ {
if (!in_array($norm, [self::NORM_L1, self::NORM_L2])) { if (!in_array($norm, [self::NORM_L1, self::NORM_L2, self::NORM_STD])) {
throw NormalizerException::unknownNorm(); throw NormalizerException::unknownNorm();
} }
@ -35,7 +53,20 @@ class Normalizer implements Preprocessor
*/ */
public function fit(array $samples) public function fit(array $samples)
{ {
// intentionally not implemented if ($this->fitted) {
return;
}
if ($this->norm == self::NORM_STD) {
$features = range(0, count($samples[0]) - 1);
foreach ($features as $i) {
$values = array_column($samples, $i);
$this->std[$i] = StandardDeviation::population($values);
$this->mean[$i] = Mean::arithmetic($values);
}
}
$this->fitted = true;
} }
/** /**
@ -43,7 +74,15 @@ class Normalizer implements Preprocessor
*/ */
public function transform(array &$samples) public function transform(array &$samples)
{ {
$method = sprintf('normalizeL%s', $this->norm); $methods = [
self::NORM_L1 => 'normalizeL1',
self::NORM_L2 => 'normalizeL2',
self::NORM_STD=> 'normalizeSTD'
];
$method = $methods[$this->norm];
$this->fit($samples);
foreach ($samples as &$sample) { foreach ($samples as &$sample) {
$this->$method($sample); $this->$method($sample);
} }
@ -88,4 +127,14 @@ class Normalizer implements Preprocessor
} }
} }
} }
/**
 * Standardizes the sample in place using the per-feature mean and standard
 * deviation collected by fit(): (value - mean) / std.
 *
 * A feature whose standard deviation is 0 had the same value in every
 * fitted sample; dividing by it would be a division by zero, so such a
 * feature is mapped to 0.0 instead.
 *
 * @param array $sample
 */
private function normalizeSTD(array &$sample)
{
    foreach ($sample as $i => $val) {
        if ($this->std[$i] != 0) {
            $sample[$i] = ($val - $this->mean[$i]) / $this->std[$i];
        } else {
            $sample[$i] = 0.0;
        }
    }
}
} }

View File

@ -0,0 +1,55 @@
<?php
declare(strict_types=1);
namespace tests\Classification\Linear;
use Phpml\Classification\Linear\Adaline;
use Phpml\ModelManager;
use PHPUnit\Framework\TestCase;
class AdalineTest extends TestCase
{
    /**
     * Trains Adaline on the AND and OR problems and checks a few predictions.
     */
    public function testPredictSingleSample()
    {
        // AND problem
        $samples = [[0, 0], [1, 0], [0, 1], [1, 1]];
        $targets = [0, 0, 0, 1];
        $classifier = new Adaline();
        $classifier->train($samples, $targets);
        $this->assertEquals(0, $classifier->predict([0.1, 0.2]));
        $this->assertEquals(0, $classifier->predict([0.1, 0.99]));
        $this->assertEquals(1, $classifier->predict([1.1, 0.8]));

        // OR problem
        $samples = [[0, 0], [1, 0], [0, 1], [1, 1]];
        $targets = [0, 1, 1, 1];
        $classifier = new Adaline();
        $classifier->train($samples, $targets);
        $this->assertEquals(0, $classifier->predict([0.1, 0.2]));
        $this->assertEquals(1, $classifier->predict([0.1, 0.99]));
        $this->assertEquals(1, $classifier->predict([1.1, 0.8]));

        return $classifier;
    }

    /**
     * A classifier saved to disk and restored must equal the original and
     * give the same predictions.
     */
    public function testSaveAndRestore()
    {
        // Instantiate new Adaline trained for OR problem
        $samples = [[0, 0], [1, 0], [0, 1], [1, 1]];
        $targets = [0, 1, 1, 1];
        $classifier = new Adaline();
        $classifier->train($samples, $targets);
        $testSamples = [[0, 1], [1, 1], [0.2, 0.1]];
        $predicted = $classifier->predict($testSamples);

        $filename = 'adaline-test-'.rand(100, 999).'-'.uniqid();
        $filepath = tempnam(sys_get_temp_dir(), $filename);

        try {
            $modelManager = new ModelManager();
            $modelManager->saveToFile($classifier, $filepath);

            $restoredClassifier = $modelManager->restoreFromFile($filepath);
            $this->assertEquals($classifier, $restoredClassifier);
            $this->assertEquals($predicted, $restoredClassifier->predict($testSamples));
        } finally {
            // tempnam() creates the file; remove it even when an assertion fails
            if (is_file($filepath)) {
                unlink($filepath);
            }
        }
    }
}

View File

@ -0,0 +1,59 @@
<?php
declare(strict_types=1);
namespace tests\Classification\Linear;
use Phpml\Classification\Linear\DecisionStump;
use Phpml\ModelManager;
use PHPUnit\Framework\TestCase;
class DecisionStumpTest extends TestCase
{
    /**
     * Trains the stump on data separable along one dimension and checks a
     * few predictions, once per dimension.
     */
    public function testPredictSingleSample()
    {
        // Samples should be separable with a line perpendicular to any dimension
        // given in the dataset
        // First: horizontal test
        $samples = [[0, 0], [1, 0], [0, 1], [1, 1]];
        $targets = [0, 0, 1, 1];
        $classifier = new DecisionStump();
        $classifier->train($samples, $targets);
        $this->assertEquals(0, $classifier->predict([0.1, 0.2]));
        $this->assertEquals(0, $classifier->predict([1.1, 0.2]));
        $this->assertEquals(1, $classifier->predict([0.1, 0.99]));
        $this->assertEquals(1, $classifier->predict([1.1, 0.8]));

        // Then: vertical test
        $samples = [[0, 0], [1, 0], [0, 1], [1, 1]];
        $targets = [0, 1, 0, 1];
        $classifier = new DecisionStump();
        $classifier->train($samples, $targets);
        $this->assertEquals(0, $classifier->predict([0.1, 0.2]));
        $this->assertEquals(0, $classifier->predict([0.1, 1.1]));
        $this->assertEquals(1, $classifier->predict([1.0, 0.99]));
        $this->assertEquals(1, $classifier->predict([1.1, 0.1]));

        return $classifier;
    }

    /**
     * A classifier saved to disk and restored must equal the original and
     * give the same predictions.
     */
    public function testSaveAndRestore()
    {
        // Instantiate new DecisionStump trained for OR problem
        $samples = [[0, 0], [1, 0], [0, 1], [1, 1]];
        $targets = [0, 1, 1, 1];
        $classifier = new DecisionStump();
        $classifier->train($samples, $targets);
        $testSamples = [[0, 1], [1, 1], [0.2, 0.1]];
        $predicted = $classifier->predict($testSamples);

        $filename = 'dstump-test-'.rand(100, 999).'-'.uniqid();
        $filepath = tempnam(sys_get_temp_dir(), $filename);

        try {
            $modelManager = new ModelManager();
            $modelManager->saveToFile($classifier, $filepath);

            $restoredClassifier = $modelManager->restoreFromFile($filepath);
            $this->assertEquals($classifier, $restoredClassifier);
            $this->assertEquals($predicted, $restoredClassifier->predict($testSamples));
        } finally {
            // tempnam() creates the file; remove it even when an assertion fails
            if (is_file($filepath)) {
                unlink($filepath);
            }
        }
    }
}

View File

@ -0,0 +1,55 @@
<?php
declare(strict_types=1);
namespace tests\Classification\Linear;
use Phpml\Classification\Linear\Perceptron;
use Phpml\ModelManager;
use PHPUnit\Framework\TestCase;
class PerceptronTest extends TestCase
{
    /**
     * Trains the perceptron on the AND and OR problems and checks a few
     * predictions.
     */
    public function testPredictSingleSample()
    {
        // AND problem
        $samples = [[0, 0], [1, 0], [0, 1], [1, 1], [0.9, 0.8]];
        $targets = [0, 0, 0, 1, 1];
        $classifier = new Perceptron(0.001, 5000);
        $classifier->train($samples, $targets);
        $this->assertEquals(0, $classifier->predict([0.1, 0.2]));
        $this->assertEquals(0, $classifier->predict([0.1, 0.99]));
        $this->assertEquals(1, $classifier->predict([1.1, 0.8]));

        // OR problem
        $samples = [[0, 0], [0.1, 0.2], [1, 0], [0, 1], [1, 1]];
        $targets = [0, 0, 1, 1, 1];
        $classifier = new Perceptron(0.001, 5000);
        $classifier->train($samples, $targets);
        $this->assertEquals(0, $classifier->predict([0, 0]));
        $this->assertEquals(1, $classifier->predict([0.1, 0.99]));
        $this->assertEquals(1, $classifier->predict([1.1, 0.8]));

        return $classifier;
    }

    /**
     * A classifier saved to disk and restored must equal the original and
     * give the same predictions.
     */
    public function testSaveAndRestore()
    {
        // Instantiate new Perceptron trained for OR problem
        $samples = [[0, 0], [1, 0], [0, 1], [1, 1]];
        $targets = [0, 1, 1, 1];
        $classifier = new Perceptron();
        $classifier->train($samples, $targets);
        $testSamples = [[0, 1], [1, 1], [0.2, 0.1]];
        $predicted = $classifier->predict($testSamples);

        $filename = 'perceptron-test-'.rand(100, 999).'-'.uniqid();
        $filepath = tempnam(sys_get_temp_dir(), $filename);

        try {
            $modelManager = new ModelManager();
            $modelManager->saveToFile($classifier, $filepath);

            $restoredClassifier = $modelManager->restoreFromFile($filepath);
            $this->assertEquals($classifier, $restoredClassifier);
            $this->assertEquals($predicted, $restoredClassifier->predict($testSamples));
        } finally {
            // tempnam() creates the file; remove it even when an assertion fails
            if (is_file($filepath)) {
                unlink($filepath);
            }
        }
    }
}

View File

@ -100,4 +100,32 @@ class NormalizerTest extends TestCase
$this->assertEquals($normalized, $samples, '', $delta = 0.01); $this->assertEquals($normalized, $samples, '', $delta = 0.01);
} }
public function testStandardNorm()
{
    // Build 10 sample vectors, each holding 3 random integers drawn from 1..100
    srand(time());
    $samples = [];
    while (count($samples) < 10) {
        $vector = [0, 0, 0];
        foreach (array_keys($vector) as $column) {
            $vector[$column] = rand(1, 100);
        }
        $samples[] = $vector;
    }

    // Standardize the vectors in place
    $normalizer = new Normalizer(Normalizer::NORM_STD);
    $normalizer->transform($samples);

    // Still 10 vectors, and no component may fall outside [-3, +3]
    $this->assertCount(10, $samples);
    $isOutOfRange = function ($element) {
        return $element > 3 || $element < -3;
    };
    foreach ($samples as $vector) {
        $outliers = array_filter($vector, $isOutOfRange);
        $this->assertCount(0, $outliers);
    }
}
} }