From e603d60841ba18383142940612269e336cab3da6 Mon Sep 17 00:00:00 2001 From: Mustafa Karabulut Date: Tue, 17 Jan 2017 17:21:58 +0200 Subject: [PATCH] Update NaiveBayes.php (#30) * Update NaiveBayes.php * Update NaiveBayes.php * Update NaiveBayes.php Update to fix "predictSample" function to enable it handle samples given as multi-dimensional arrays. * Update NaiveBayes.php * Update NaiveBayes.php --- src/Phpml/Classification/NaiveBayes.php | 149 +++++++++++++++++++++--- 1 file changed, 135 insertions(+), 14 deletions(-) diff --git a/src/Phpml/Classification/NaiveBayes.php b/src/Phpml/Classification/NaiveBayes.php index 41441be..17b0fbe 100644 --- a/src/Phpml/Classification/NaiveBayes.php +++ b/src/Phpml/Classification/NaiveBayes.php @@ -6,31 +6,152 @@ namespace Phpml\Classification; use Phpml\Helper\Predictable; use Phpml\Helper\Trainable; +use Phpml\Math\Statistic\Mean; +use Phpml\Math\Statistic\StandardDeviation; class NaiveBayes implements Classifier { use Trainable, Predictable; + const CONTINUOS = 1; + const NOMINAL = 2; + const EPSILON = 1e-10; + private $std = array(); + private $mean= array(); + private $discreteProb = array(); + private $dataType = array(); + private $p = array(); + private $sampleCount = 0; + private $featureCount = 0; + private $labels = array(); + public function train(array $samples, array $targets) + { + $this->samples = $samples; + $this->targets = $targets; + $this->sampleCount = count($samples); + $this->featureCount = count($samples[0]); + // Get distinct targets + $this->labels = $targets; + array_unique($this->labels); + foreach ($this->labels as $label) { + $samples = $this->getSamplesByLabel($label); + $this->p[$label] = count($samples) / $this->sampleCount; + $this->calculateStatistics($label, $samples); + } + } + + /** + * Calculates vital statistics for each label & feature. Stores these + * values in private array in order to avoid repeated calculation + * @param string $label + * @param array $samples + */ + private function calculateStatistics($label, $samples) + { + $this->std[$label] = array_fill(0, $this->featureCount, 0); + $this->mean[$label]= array_fill(0, $this->featureCount, 0); + $this->dataType[$label] = array_fill(0, $this->featureCount, self::CONTINUOS); + $this->discreteProb[$label] = array_fill(0, $this->featureCount, self::CONTINUOS); + for ($i=0; $i<$this->featureCount; $i++) { + // Get the values of nth column in the samples array + // Mean::arithmetic is called twice, can be optimized + $values = array_column($samples, $i); + $numValues = count($values); + // if the values contain non-numeric data, + // then it should be treated as nominal/categorical/discrete column + if ($values !== array_filter($values, 'is_numeric')) { + $this->dataType[$label][$i] = self::NOMINAL; + $this->discreteProb[$label][$i] = array_count_values($values); + $db = &$this->discreteProb[$label][$i]; + $db = array_map(function ($el) use ($numValues) { + return $el / $numValues; + }, $db); + } else { + $this->mean[$label][$i] = Mean::arithmetic($values); + // Add epsilon in order to avoid zero stdev + $this->std[$label][$i] = 1e-10 + StandardDeviation::population($values, false); + } + } + } + + /** + * Calculates the probability P(label|sample_n) + * + * @param array $sample + * @param int $feature + * @param string $label + */ + private function sampleProbability($sample, $feature, $label) + { + $value = $sample[$feature]; + if ($this->dataType[$label][$feature] == self::NOMINAL) { + if (! isset($this->discreteProb[$label][$feature][$value]) || + $this->discreteProb[$label][$feature][$value] == 0) { + return self::EPSILON; + } + return $this->discreteProb[$label][$feature][$value]; + } + $std = $this->std[$label][$feature] ; + $mean= $this->mean[$label][$feature]; + // Calculate the probability density by use of normal/Gaussian distribution + // Ref: https://en.wikipedia.org/wiki/Normal_distribution + // + // In order to avoid numerical errors because of small or zero values, + // some libraries adopt taking log of calculations such as + // scikit-learn did. + // (See : https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/naive_bayes.py) + $pdf = -0.5 * log(2.0 * pi() * $std * $std); + $pdf -= 0.5 * pow($value - $mean, 2) / ($std * $std); + return $pdf; + } + + /** + * Return samples belonging to specific label + * @param string $label + * @return array + */ + private function getSamplesByLabel($label) + { + $samples = array(); + for ($i=0; $i<$this->sampleCount; $i++) { + if ($this->targets[$i] == $label) { + $samples[] = $this->samples[$i]; + } + } + return $samples; + } /** * @param array $sample - * * @return mixed */ protected function predictSample(array $sample) { - $predictions = []; - foreach ($this->targets as $index => $label) { - $predictions[$label] = 0; - foreach ($sample as $token => $count) { - if (array_key_exists($token, $this->samples[$index])) { - $predictions[$label] += $count * $this->samples[$index][$token]; - } - } + $isArray = is_array($sample[0]); + $samples = $sample; + if (!$isArray) { + $samples = array($sample); } - - arsort($predictions, SORT_NUMERIC); - reset($predictions); - - return key($predictions); + $samplePredictions = array(); + foreach ($samples as $sample) { + // Use NaiveBayes assumption for each label using: + // P(label|features) = P(label) * P(feature0|label) * P(feature1|label) .... P(featureN|label) + // Then compare probability for each class to determine which label is most likely + $predictions = array(); + foreach ($this->labels as $label) { + $p = $this->p[$label]; + for ($i=0; $i<$this->featureCount; $i++) { + $Plf = $this->sampleProbability($sample, $i, $label); + $p += $Plf; + } + $predictions[$label] = $p; + } + arsort($predictions, SORT_NUMERIC); + reset($predictions); + $samplePredictions[] = key($predictions); + } + if (! $isArray) { + return $samplePredictions[0]; + } + return $samplePredictions; } }