From 325427c72382dbe8c2ce6ba7490959f8ff6b9e35 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sat, 14 May 2016 21:30:13 +0200 Subject: [PATCH] update missing docs --- .../imputation-missing-values.md | 44 +++++++++++ .../preprocessing/normalization.md | 58 ++++++++++++++ docs/math/statistic.md | 79 ++++++++++++++++++- 3 files changed, 178 insertions(+), 3 deletions(-) diff --git a/docs/machine-learning/preprocessing/imputation-missing-values.md b/docs/machine-learning/preprocessing/imputation-missing-values.md index db64d8d..186f424 100644 --- a/docs/machine-learning/preprocessing/imputation-missing-values.md +++ b/docs/machine-learning/preprocessing/imputation-missing-values.md @@ -1 +1,45 @@ # Imputation missing values + +For various reasons, many real world datasets contain missing values, often encoded as blanks, NaNs or other placeholders. +To solve this problem you can use the `Imputer` class. + +## Constructor Parameters + +* $missingValue (mixed) - this value will be replaced (default null) +* $strategy (Strategy) - imputation strategy (ready to use: MeanStrategy, MedianStrategy, MostFrequentStrategy) +* $axis (int) - axis for strategy, Imputer::AXIS_COLUMN or Imputer::AXIS_ROW + +``` +$imputer = new Imputer(null, new MeanStrategy(), Imputer::AXIS_COLUMN); +$imputer = new Imputer(null, new MedianStrategy(), Imputer::AXIS_ROW); +``` + +## Strategy + +* MeanStrategy - replace missing values using the mean along the axis +* MedianStrategy - replace missing values using the median along the axis +* MostFrequentStrategy - replace missing values using the most frequent value along the axis + +## Example of use + +``` +$data = [ + [1, null, 3, 4], + [4, 3, 2, 1], + [null, 6, 7, 8], + [8, 7, null, 5], +]; + +$imputer = new Imputer(null, new MeanStrategy(), Imputer::AXIS_COLUMN); +$imputer->preprocess($data); + +/* +$data = [ + [1, 5.33, 3, 4], + [4, 3, 2, 1], + [4.33, 6, 7, 8], + [8, 7, 4, 5], +]; +*/ + +``` diff --git 
a/docs/machine-learning/preprocessing/normalization.md b/docs/machine-learning/preprocessing/normalization.md index a0dbc80..61b1a8d 100644 --- a/docs/machine-learning/preprocessing/normalization.md +++ b/docs/machine-learning/preprocessing/normalization.md @@ -1 +1,59 @@ # Normalization + +Normalization is the process of scaling individual samples to have unit norm. + +## L2 norm + +[http://mathworld.wolfram.com/L2-Norm.html](http://mathworld.wolfram.com/L2-Norm.html) + +Example: + +``` +use Phpml\Preprocessing\Normalizer; + +$samples = [ + [1, -1, 2], + [2, 0, 0], + [0, 1, -1], +]; + +$normalizer = new Normalizer(); +$normalizer->preprocess($samples); + +/* +$samples = [ + [0.4, -0.4, 0.81], + [1.0, 0.0, 0.0], + [0.0, 0.7, -0.7], +]; +*/ + +``` + +## L1 norm + +[http://mathworld.wolfram.com/L1-Norm.html](http://mathworld.wolfram.com/L1-Norm.html) + +Example: + +``` +use Phpml\Preprocessing\Normalizer; + +$samples = [ + [1, -1, 2], + [2, 0, 0], + [0, 1, -1], +]; + +$normalizer = new Normalizer(Normalizer::NORM_L1); +$normalizer->preprocess($samples); + +/* +$samples = [ + [0.25, -0.25, 0.5], + [1.0, 0.0, 0.0], + [0.0, 0.5, -0.5], +]; +*/ + +``` diff --git a/docs/math/statistic.md b/docs/math/statistic.md index 89cc00e..626828e 100644 --- a/docs/math/statistic.md +++ b/docs/math/statistic.md @@ -1,7 +1,80 @@ # Statistic -### Correlation +Selected statistical methods. -### Mean +## Correlation -### Standard Deviation +Correlation coefficients are used in statistics to measure how strong a relationship is between two variables. There are several types of correlation coefficient. + +### Pearson correlation + +Pearson’s correlation or Pearson correlation is a correlation coefficient commonly used in linear regression. 
+ +Example: + +``` +use Phpml\Math\Statistic\Correlation; + +$x = [43, 21, 25, 42, 57, 59]; +$y = [99, 65, 79, 75, 87, 82]; + +Correlation::pearson($x, $y); +// return 0.549 +``` + +## Mean + +### Arithmetic + +Example: + +``` +use Phpml\Math\Statistic\Mean; + +Mean::arithmetic([2, 5]); +// return 3.5 + +Mean::arithmetic([0.5, 0.5, 1.5, 2.5, 3.5]); +// return 1.7 +``` + +## Median + +Example: + +``` +use Phpml\Math\Statistic\Mean; + +Mean::median([5, 2, 6, 1, 3, 4]); +// return 3.5 + +Mean::median([5, 2, 6, 1, 3]); +// return 3 +``` + +## Mode + +Example: + +``` +use Phpml\Math\Statistic\Mean; + +Mean::mode([5, 2, 6, 1, 3, 4, 6, 6, 5]); +// return 6 +``` + +## Standard Deviation + +Example: + +``` +use Phpml\Math\Statistic\StandardDeviation; + +$population = [5, 6, 8, 9]; +StandardDeviation::population($population); +// return 1.825 + +$population = [7100, 15500, 4400, 4400, 5900, 4600, 8800, 2000, 2750, 2550, 960, 1025]; +StandardDeviation::population($population); +// return 4079 +```