samples = $samples; $this->targets = $targets; $this->sampleCount = count($samples); $this->featureCount = count($samples[0]); $labelCounts = array_count_values($targets); $this->labels = array_keys($labelCounts); foreach ($this->labels as $label) { $samples = $this->getSamplesByLabel($label); $this->p[$label] = count($samples) / $this->sampleCount; $this->calculateStatistics($label, $samples); } } /** * Calculates vital statistics for each label & feature. Stores these * values in private array in order to avoid repeated calculation * @param string $label * @param array $samples */ private function calculateStatistics($label, $samples) { $this->std[$label] = array_fill(0, $this->featureCount, 0); $this->mean[$label]= array_fill(0, $this->featureCount, 0); $this->dataType[$label] = array_fill(0, $this->featureCount, self::CONTINUOS); $this->discreteProb[$label] = array_fill(0, $this->featureCount, self::CONTINUOS); for ($i=0; $i<$this->featureCount; $i++) { // Get the values of nth column in the samples array // Mean::arithmetic is called twice, can be optimized $values = array_column($samples, $i); $numValues = count($values); // if the values contain non-numeric data, // then it should be treated as nominal/categorical/discrete column if ($values !== array_filter($values, 'is_numeric')) { $this->dataType[$label][$i] = self::NOMINAL; $this->discreteProb[$label][$i] = array_count_values($values); $db = &$this->discreteProb[$label][$i]; $db = array_map(function ($el) use ($numValues) { return $el / $numValues; }, $db); } else { $this->mean[$label][$i] = Mean::arithmetic($values); // Add epsilon in order to avoid zero stdev $this->std[$label][$i] = 1e-10 + StandardDeviation::population($values, false); } } } /** * Calculates the probability P(label|sample_n) * * @param array $sample * @param int $feature * @param string $label * @return float */ private function sampleProbability($sample, $feature, $label) { $value = $sample[$feature]; if ($this->dataType[$label][$feature] == self::NOMINAL) { if (! isset($this->discreteProb[$label][$feature][$value]) || $this->discreteProb[$label][$feature][$value] == 0) { return self::EPSILON; } return $this->discreteProb[$label][$feature][$value]; } $std = $this->std[$label][$feature] ; $mean= $this->mean[$label][$feature]; // Calculate the probability density by use of normal/Gaussian distribution // Ref: https://en.wikipedia.org/wiki/Normal_distribution // // In order to avoid numerical errors because of small or zero values, // some libraries adopt taking log of calculations such as // scikit-learn did. // (See : https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/naive_bayes.py) $pdf = -0.5 * log(2.0 * pi() * $std * $std); $pdf -= 0.5 * pow($value - $mean, 2) / ($std * $std); return $pdf; } /** * Return samples belonging to specific label * @param string $label * @return array */ private function getSamplesByLabel($label) { $samples = array(); for ($i=0; $i<$this->sampleCount; $i++) { if ($this->targets[$i] == $label) { $samples[] = $this->samples[$i]; } } return $samples; } /** * @param array $sample * @return mixed */ protected function predictSample(array $sample) { // Use NaiveBayes assumption for each label using: // P(label|features) = P(label) * P(feature0|label) * P(feature1|label) .... P(featureN|label) // Then compare probability for each class to determine which label is most likely $predictions = array(); foreach ($this->labels as $label) { $p = $this->p[$label]; for ($i=0; $i<$this->featureCount; $i++) { $Plf = $this->sampleProbability($sample, $i, $label); $p += $Plf; } $predictions[$label] = $p; } arsort($predictions, SORT_NUMERIC); reset($predictions); return key($predictions); } }