Merge pull request #5 from php-ai/develop

SVM and Preprocessing tools
2024-09-28 15:09:01 +00:00 · 2016-05-14 21:39:23 +02:00 · 2016-05-14 21:39:23 +02:00 · 2393b9f137
commit 2393b9f137
parent 096db0e9dd 325427c723
68 changed files with 2184 additions and 143 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1 +1,3 @@
 /vendor/
+humbuglog.*
+/bin/phpunit
--- a/README.md
+++ b/README.md
@ -1,4 +1,4 @@
-# PHP Machine Learning library
+# PHP-ML - Machine Learning library for PHP

 [![Build Status](https://scrutinizer-ci.com/g/php-ai/php-ml/badges/build.png?b=develop)](https://scrutinizer-ci.com/g/php-ai/php-ml/build-status/develop)
 [![Documentation Status](https://readthedocs.org/projects/php-ml/badge/?version=develop)](http://php-ml.readthedocs.org/en/develop/?badge=develop)
@ -37,22 +37,32 @@ composer require php-ai/php-ml
 ## Features

 * Classification
+    * [SVC](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/svc/)
    * [k-Nearest Neighbors](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/k-nearest-neighbors/)
    * [Naive Bayes](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/naive-bayes/)
 * Regression
    * [Least Squares](http://php-ml.readthedocs.io/en/latest/machine-learning/regression/least-squares/)
+    * [SVR](http://php-ml.readthedocs.io/en/latest/machine-learning/regression/svr/)
 * Clustering
    * [k-Means](http://php-ml.readthedocs.io/en/latest/machine-learning/clustering/k-means)
    * [DBSCAN](http://php-ml.readthedocs.io/en/latest/machine-learning/clustering/dbscan)
 * Cross Validation
    * [Random Split](http://php-ml.readthedocs.io/en/latest/machine-learning/cross-validation/random-split)
+* Preprocessing
+    * [Normalization](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/normalization)
+    * [Imputation missing values](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/imputation-missing-values)
+* Feature Extraction
+    * [Token Count Vectorizer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/token-count-vectorizer)
 * Datasets
    * [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset)
    * Ready to use:
        * [Iris](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/iris/)
+        * [Wine](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/wine/)
+        * [Glass](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/glass/)
 * Math
    * [Distance](http://php-ml.readthedocs.io/en/latest/math/distance/)
    * [Matrix](http://php-ml.readthedocs.io/en/latest/math/matrix/)
+    * [Statistic](http://php-ml.readthedocs.io/en/latest/math/statistic/)
    

 ## Contribute
--- a/bin/libsvm/svm-predict
+++ b/bin/libsvm/svm-predict
--- a/bin/libsvm/svm-predict.exe
+++ b/bin/libsvm/svm-predict.exe
--- a/bin/libsvm/svm-scale
+++ b/bin/libsvm/svm-scale
--- a/bin/libsvm/svm-scale.exe
+++ b/bin/libsvm/svm-scale.exe
--- a/bin/libsvm/svm-train
+++ b/bin/libsvm/svm-train
--- a/bin/libsvm/svm-train.exe
+++ b/bin/libsvm/svm-train.exe
--- a/composer.json
+++ b/composer.json
@ -1,7 +1,7 @@
 {
    "name": "php-ai/php-ml",
    "type": "library",
-    "description": "PHP Machine Learning library",
+    "description": "PHP-ML - Machine Learning library for PHP",
    "license": "MIT",
    "keywords": ["machine learning","pattern recognition","computational learning theory","artificial intelligence"],
    "homepage": "https://github.com/php-ai/php-ml",
@ -16,13 +16,13 @@
            "Phpml": "src/"
        }
    },
-    "config": {
-        "bin-dir": "bin"
-    },
    "require": {
        "php": ">=7.0.0"
    },
    "require-dev": {
        "phpunit/phpunit": "^5.2"
+    },
+    "config": {
+        "bin-dir": "bin"
    }
 }
--- a/composer.lock
+++ b/composer.lock
@ -4,7 +4,7 @@
        "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
        "This file is @generated automatically"
    ],
-    "hash": "7c34eebd6b8749a1cd09df57e5d1f47a",
+    "hash": "f3e2d9975d300b3ea4c3568de44d8499",
    "content-hash": "087091d0c339e9fa3a551a189ea658bf",
    "packages": [],
    "packages-dev": [
@ -64,16 +64,16 @@
        },
        {
            "name": "myclabs/deep-copy",
-            "version": "1.5.0",
+            "version": "1.5.1",
            "source": {
                "type": "git",
                "url": "https://github.com/myclabs/DeepCopy.git",
-                "reference": "e3abefcd7f106677fd352cd7c187d6c969aa9ddc"
+                "reference": "a8773992b362b58498eed24bf85005f363c34771"
            },
            "dist": {
                "type": "zip",
-                "url": "https://api.github.com/repos/myclabs/DeepCopy/zipball/e3abefcd7f106677fd352cd7c187d6c969aa9ddc",
-                "reference": "e3abefcd7f106677fd352cd7c187d6c969aa9ddc",
+                "url": "https://api.github.com/repos/myclabs/DeepCopy/zipball/a8773992b362b58498eed24bf85005f363c34771",
+                "reference": "a8773992b362b58498eed24bf85005f363c34771",
                "shasum": ""
            },
            "require": {
@ -102,7 +102,7 @@
                "object",
                "object graph"
            ],
-            "time": "2015-11-07 22:20:37"
+            "time": "2015-11-20 12:04:31"
        },
        {
            "name": "phpdocumentor/reflection-docblock",
@ -217,16 +217,16 @@
        },
        {
            "name": "phpunit/php-code-coverage",
-            "version": "3.3.0",
+            "version": "3.3.1",
            "source": {
                "type": "git",
                "url": "https://github.com/sebastianbergmann/php-code-coverage.git",
-                "reference": "fe33716763b604ade4cb442c0794f5bd5ad73004"
+                "reference": "2431befdd451fac43fbcde94d1a92fb3b8b68f86"
            },
            "dist": {
                "type": "zip",
-                "url": "https://api.github.com/repos/sebastianbergmann/php-code-coverage/zipball/fe33716763b604ade4cb442c0794f5bd5ad73004",
-                "reference": "fe33716763b604ade4cb442c0794f5bd5ad73004",
+                "url": "https://api.github.com/repos/sebastianbergmann/php-code-coverage/zipball/2431befdd451fac43fbcde94d1a92fb3b8b68f86",
+                "reference": "2431befdd451fac43fbcde94d1a92fb3b8b68f86",
                "shasum": ""
            },
            "require": {
@ -244,7 +244,7 @@
            },
            "suggest": {
                "ext-dom": "*",
-                "ext-xdebug": ">=2.2.1",
+                "ext-xdebug": ">=2.4.0",
                "ext-xmlwriter": "*"
            },
            "type": "library",
@ -276,7 +276,7 @@
                "testing",
                "xunit"
            ],
-            "time": "2016-03-03 08:49:08"
+            "time": "2016-04-08 08:14:53"
        },
        {
            "name": "phpunit/php-file-iterator",
@ -458,16 +458,16 @@
        },
        {
            "name": "phpunit/phpunit",
-            "version": "5.3.1",
+            "version": "5.3.2",
            "source": {
                "type": "git",
                "url": "https://github.com/sebastianbergmann/phpunit.git",
-                "reference": "34a3acb401ae79deb37bc6e5f5ec3d325b369b4c"
+                "reference": "2c6da3536035617bae3fe3db37283c9e0eb63ab3"
            },
            "dist": {
                "type": "zip",
-                "url": "https://api.github.com/repos/sebastianbergmann/phpunit/zipball/34a3acb401ae79deb37bc6e5f5ec3d325b369b4c",
-                "reference": "34a3acb401ae79deb37bc6e5f5ec3d325b369b4c",
+                "url": "https://api.github.com/repos/sebastianbergmann/phpunit/zipball/2c6da3536035617bae3fe3db37283c9e0eb63ab3",
+                "reference": "2c6da3536035617bae3fe3db37283c9e0eb63ab3",
                "shasum": ""
            },
            "require": {
@ -529,20 +529,20 @@
                "testing",
                "xunit"
            ],
-            "time": "2016-04-07 07:04:34"
+            "time": "2016-04-12 16:20:08"
        },
        {
            "name": "phpunit/phpunit-mock-objects",
-            "version": "3.1.2",
+            "version": "3.1.3",
            "source": {
                "type": "git",
                "url": "https://github.com/sebastianbergmann/phpunit-mock-objects.git",
-                "reference": "7c34c9bdde4131b824086457a3145e27dba10ca1"
+                "reference": "151c96874bff6fe61a25039df60e776613a61489"
            },
            "dist": {
                "type": "zip",
-                "url": "https://api.github.com/repos/sebastianbergmann/phpunit-mock-objects/zipball/7c34c9bdde4131b824086457a3145e27dba10ca1",
-                "reference": "7c34c9bdde4131b824086457a3145e27dba10ca1",
+                "url": "https://api.github.com/repos/sebastianbergmann/phpunit-mock-objects/zipball/151c96874bff6fe61a25039df60e776613a61489",
+                "reference": "151c96874bff6fe61a25039df60e776613a61489",
                "shasum": ""
            },
            "require": {
@ -585,7 +585,7 @@
                "mock",
                "xunit"
            ],
-            "time": "2016-03-24 05:58:25"
+            "time": "2016-04-20 14:39:26"
        },
        {
            "name": "sebastian/code-unit-reverse-lookup",
@ -750,16 +750,16 @@
        },
        {
            "name": "sebastian/environment",
-            "version": "1.3.5",
+            "version": "1.3.6",
            "source": {
                "type": "git",
                "url": "https://github.com/sebastianbergmann/environment.git",
-                "reference": "dc7a29032cf72b54f36dac15a1ca5b3a1b6029bf"
+                "reference": "2292b116f43c272ff4328083096114f84ea46a56"
            },
            "dist": {
                "type": "zip",
-                "url": "https://api.github.com/repos/sebastianbergmann/environment/zipball/dc7a29032cf72b54f36dac15a1ca5b3a1b6029bf",
-                "reference": "dc7a29032cf72b54f36dac15a1ca5b3a1b6029bf",
+                "url": "https://api.github.com/repos/sebastianbergmann/environment/zipball/2292b116f43c272ff4328083096114f84ea46a56",
+                "reference": "2292b116f43c272ff4328083096114f84ea46a56",
                "shasum": ""
            },
            "require": {
@ -796,7 +796,7 @@
                "environment",
                "hhvm"
            ],
-            "time": "2016-02-26 18:40:46"
+            "time": "2016-05-04 07:59:13"
        },
        {
            "name": "sebastian/exporter",
@ -1101,7 +1101,7 @@
        },
        {
            "name": "symfony/yaml",
-            "version": "v3.0.4",
+            "version": "v3.0.5",
            "source": {
                "type": "git",
                "url": "https://github.com/symfony/yaml.git",
--- a/docs/index.md
+++ b/docs/index.md
@ -1,4 +1,4 @@
-# PHP Machine Learning library
+# PHP-ML - Machine Learning library for PHP

 [![Build Status](https://scrutinizer-ci.com/g/php-ai/php-ml/badges/build.png?b=develop)](https://scrutinizer-ci.com/g/php-ai/php-ml/build-status/develop)
 [![Documentation Status](https://readthedocs.org/projects/php-ml/badge/?version=develop)](http://php-ml.readthedocs.org/en/develop/?badge=develop)
@ -37,22 +37,32 @@ composer require php-ai/php-ml
 ## Features

 * Classification
+    * [SVC](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/svc/)
    * [k-Nearest Neighbors](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/k-nearest-neighbors/)
    * [Naive Bayes](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/naive-bayes/)
 * Regression
    * [Least Squares](http://php-ml.readthedocs.io/en/latest/machine-learning/regression/least-squares/)
+    * [SVR](http://php-ml.readthedocs.io/en/latest/machine-learning/regression/svr/)
 * Clustering
    * [k-Means](http://php-ml.readthedocs.io/en/latest/machine-learning/clustering/k-means)
    * [DBSCAN](http://php-ml.readthedocs.io/en/latest/machine-learning/clustering/dbscan)
 * Cross Validation
    * [Random Split](http://php-ml.readthedocs.io/en/latest/machine-learning/cross-validation/random-split)
+* Preprocessing
+    * [Normalization](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/normalization)
+    * [Imputation missing values](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/imputation-missing-values)
+* Feature Extraction
+    * [Token Count Vectorizer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/token-count-vectorizer)
 * Datasets
    * [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset)
    * Ready to use:
        * [Iris](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/iris/)
+        * [Wine](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/wine/)
+        * [Glass](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/glass/)
 * Math
    * [Distance](http://php-ml.readthedocs.io/en/latest/math/distance/)
    * [Matrix](http://php-ml.readthedocs.io/en/latest/math/matrix/)
+    * [Statistic](http://php-ml.readthedocs.io/en/latest/math/statistic/)
    

 ## Contribute
--- a/docs/machine-learning/classification/svc.md
+++ b/docs/machine-learning/classification/svc.md
@ -0,0 +1,47 @@
+# Support Vector Classification
+
+Classifier implementing Support Vector Machine based on libsvm.
+
+### Constructor Parameters
+
+* $kernel (int) - kernel type to be used in the algorithm (default Kernel::LINEAR)
+* $cost (float) - parameter C of C-SVC (default 1.0)
+* $degree (int) - degree of the Kernel::POLYNOMIAL function (default 3)
+* $gamma (float) - kernel coefficient for ‘Kernel::RBF’, ‘Kernel::POLYNOMIAL’ and ‘Kernel::SIGMOID’. If gamma is ‘null’ then 1/features will be used instead.
+* $coef0 (float) - independent term in kernel function. It is only significant in ‘Kernel::POLYNOMIAL’ and ‘Kernel::SIGMOID’ (default 0.0)
+* $tolerance (float) - tolerance of termination criterion (default 0.001)
+* $cacheSize (int) - cache memory size in MB (default 100)
+* $shrinking (bool) - whether to use the shrinking heuristics (default true)
+* $probabilityEstimates (bool) - whether to enable probability estimates (default false)
+
+```
+$classifier = new SVC(Kernel::LINEAR, $cost = 1000);
+$classifier = new SVC(Kernel::RBF, $cost = 1000, $degree = 3, $gamma = 6);
+```
+
+### Train
+
+To train a classifier simply provide train samples and labels (as `array`). Example:
+
+```
+use Phpml\Classification\SVC;
+use Phpml\SupportVectorMachine\Kernel;
+
+$samples = [[1, 3], [1, 4], [2, 4], [3, 1], [4, 1], [4, 2]];
+$labels = ['a', 'a', 'a', 'b', 'b', 'b'];
+
+$classifier = new SVC(Kernel::LINEAR, $cost = 1000);
+$classifier->train($samples, $labels);
+```
+
+### Predict
+
+To predict sample label use `predict` method. You can provide one sample or array of samples:
+
+```
+$classifier->predict([3, 2]);
+// return 'b'
+
+$classifier->predict([[3, 2], [1, 5]]);
+// return ['b', 'a']
+```
--- a/docs/machine-learning/datasets/demo/glass.md
+++ b/docs/machine-learning/datasets/demo/glass.md
@ -0,0 +1,42 @@
+# Glass Dataset
+
+From USA Forensic Science Service; 6 types of glass; defined in terms of their oxide content (e.g. Na, Fe, K, etc.)
+
+### Specification
+
+| Classes               | 6     |
+| Samples total         | 214   |
+| Features per sample   | 9     |
+
+Samples per class:
+ * 70 float processed building windows
+ * 17 float processed vehicle windows
+ * 76 non-float processed building windows
+ * 13 containers
+ * 9 tableware
+ * 29 headlamps
+
+### Load
+
+To load Glass dataset simply use:
+
+```
+use Phpml\Dataset\Demo\Glass;
+
+$dataset = new Glass();
+```
+
+### Several samples example
+
+```
+RI: refractive index,Na: Sodium,Mg: Magnesium,Al: Aluminum,Si: Silicon,K: Potassium,Ca: Calcium,Ba: Barium,Fe: Iron,type of glass
+1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.00,building_windows_float_processed
+1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.00,building_windows_float_processed
+1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.00,building_windows_float_processed
+1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.00,building_windows_float_processed
+1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.00,building_windows_float_processed
+1.51596,12.79,3.61,1.62,72.97,0.64,8.07,0.00,0.26,building_windows_float_processed
+1.51743,13.30,3.60,1.14,73.09,0.58,8.17,0.00,0.00,building_windows_float_processed
+1.51756,13.15,3.61,1.05,73.24,0.57,8.24,0.00,0.00,building_windows_float_processed
+1.51918,14.04,3.58,1.37,72.08,0.56,8.30,0.00,0.00,building_windows_float_processed
+```
--- a/docs/machine-learning/datasets/demo/iris.md
+++ b/docs/machine-learning/datasets/demo/iris.md
@ -14,6 +14,8 @@ Most popular and widely available dataset of iris flower measurement and class n
 To load Iris dataset simple use:

 ```
+use Phpml\Dataset\Demo\Iris;
+
 $dataset = new Iris();
 ```

--- a/docs/machine-learning/datasets/demo/wine.md
+++ b/docs/machine-learning/datasets/demo/wine.md
@ -0,0 +1,35 @@
+# Wine Dataset
+
+These data are the results of a chemical analysis of wines grown in the same region in Italy but derived from three different cultivars. The analysis determined the quantities of 13 constituents found in each of the three types of wines. 
+
+### Specification
+
+| Classes               | 3     |
+| Samples per class     | class 1 59; class 2 71; class 3 48    |
+| Samples total         | 178   |
+| Features per sample   | 13     |
+
+### Load
+
+To load Wine dataset simply use:
+
+```
+use Phpml\Dataset\Demo\Wine;
+
+$dataset = new Wine();
+```
+
+### Several samples example
+
+```
+alcohol,malic acid,ash,alcalinity of ash,magnesium,total phenols,flavanoids,nonflavanoid phenols,proanthocyanins,color intensity,hue,OD280/OD315 of diluted wines,proline,class
+14.23,1.71,2.43,15.6,127,2.8,3.06,.28,2.29,5.64,1.04,3.92,1065,1
+13.2,1.78,2.14,11.2,100,2.65,2.76,.26,1.28,4.38,1.05,3.4,1050,1
+13.16,2.36,2.67,18.6,101,2.8,3.24,.3,2.81,5.68,1.03,3.17,1185,1
+14.37,1.95,2.5,16.8,113,3.85,3.49,.24,2.18,7.8,.86,3.45,1480,1
+13.24,2.59,2.87,21,118,2.8,2.69,.39,1.82,4.32,1.04,2.93,735,1
+14.2,1.76,2.45,15.2,112,3.27,3.39,.34,1.97,6.75,1.05,2.85,1450,1
+14.39,1.87,2.45,14.6,96,2.5,2.52,.3,1.98,5.25,1.02,3.58,1290,1
+14.06,2.15,2.61,17.6,121,2.6,2.51,.31,1.25,5.05,1.06,3.58,1295,1
+14.83,1.64,2.17,14,97,2.8,2.98,.29,1.98,5.2,1.08,2.85,1045,1
+```
--- a/docs/machine-learning/feature-extraction/token-count-vectorizer.md
+++ b/docs/machine-learning/feature-extraction/token-count-vectorizer.md
@ -0,0 +1,50 @@
+# Token Count Vectorizer
+
+Transform a collection of text samples to a vector of token counts.
+
+### Constructor Parameters
+
+* $tokenizer (Tokenizer) - tokenizer object (see below)
+* $minDF (float) - ignore tokens whose sample frequency is strictly lower than the given threshold. This value is also called the cut-off in the literature (default 0)
+
+```
+use Phpml\FeatureExtraction\TokenCountVectorizer;
+use Phpml\Tokenization\WhitespaceTokenizer;
+
+$vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer());
+```
+
+### Transformation
+
+To transform a collection of text samples use `transform` method. Example:
+
+```
+$samples = [
+    'Lorem ipsum dolor sit amet dolor',
+    'Mauris placerat ipsum dolor',
+    'Mauris diam eros fringilla diam',
+];
+
+$vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer());
+$vectorizer->transform($samples);
+// return $vector = [
+//    [0 => 1, 1 => 1, 2 => 2, 3 => 1, 4 => 1],
+//    [5 => 1, 6 => 1, 1 => 1, 2 => 1],
+//    [5 => 1, 7 => 2, 8 => 1, 9 => 1],
+//];
+        
+```
+
+### Vocabulary
+
+You can extract vocabulary using `getVocabulary()` method. Example:
+
+```
+$vectorizer->getVocabulary();
+// return $vocabulary = ['Lorem', 'ipsum', 'dolor', 'sit', 'amet', 'Mauris', 'placerat', 'diam', 'eros', 'fringilla'];
+```
+
+### Tokenizers
+
+* WhitespaceTokenizer - select tokens by whitespace.
+* WordTokenizer - select tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator).
--- a/docs/machine-learning/preprocessing/imputation-missing-values.md
+++ b/docs/machine-learning/preprocessing/imputation-missing-values.md
@ -0,0 +1,45 @@
+# Imputation missing values
+
+For various reasons, many real world datasets contain missing values, often encoded as blanks, NaNs or other placeholders.
+To solve this problem you can use the `Imputer` class.
+
+## Constructor Parameters
+
+* $missingValue (mixed) - this value will be replaced (default null)
+* $strategy (Strategy) - imputation strategy (ready to use: MeanStrategy, MedianStrategy, MostFrequentStrategy)
+* $axis (int) - axis for strategy, Imputer::AXIS_COLUMN or Imputer::AXIS_ROW
+
+```
+$imputer = new Imputer(null, new MeanStrategy(), Imputer::AXIS_COLUMN);
+$imputer = new Imputer(null, new MedianStrategy(), Imputer::AXIS_ROW);
+```
+
+## Strategy
+
+* MeanStrategy - replace missing values using the mean along the axis
+* MedianStrategy - replace missing values using the median along the axis
+* MostFrequentStrategy - replace missing values using the most frequent value along the axis
+
+## Example of use
+
+```
+$data = [
+    [1, null, 3, 4],
+    [4, 3, 2, 1],
+    [null, 6, 7, 8],
+    [8, 7, null, 5],
+];
+
+$imputer = new Imputer(null, new MeanStrategy(), Imputer::AXIS_COLUMN);
+$imputer->preprocess($data);
+
+/*
+$data = [
+    [1, 5.33, 3, 4],
+    [4, 3, 2, 1],
+    [4.33, 6, 7, 8],
+    [8, 7, 4, 5],
+];
+*/
+
+```
--- a/docs/machine-learning/preprocessing/normalization.md
+++ b/docs/machine-learning/preprocessing/normalization.md
@ -0,0 +1,59 @@
+# Normalization
+
+Normalization is the process of scaling individual samples to have unit norm.
+
+## L2 norm
+
+[http://mathworld.wolfram.com/L2-Norm.html](http://mathworld.wolfram.com/L2-Norm.html)
+
+Example:
+
+```
+use Phpml\Preprocessing\Normalizer;
+
+$samples = [
+    [1, -1, 2],
+    [2, 0, 0],
+    [0, 1, -1],
+];
+
+$normalizer = new Normalizer();
+$normalizer->preprocess($samples);
+
+/*
+$samples = [
+  [0.4, -0.4, 0.81],
+  [1.0, 0.0, 0.0],
+  [0.0, 0.7, -0.7],
+];
+*/
+
+```
+
+## L1 norm
+
+[http://mathworld.wolfram.com/L1-Norm.html](http://mathworld.wolfram.com/L1-Norm.html)
+
+Example:
+
+```
+use Phpml\Preprocessing\Normalizer;
+
+$samples = [
+    [1, -1, 2],
+    [2, 0, 0],
+    [0, 1, -1],
+];
+
+$normalizer = new Normalizer(Normalizer::NORM_L1);
+$normalizer->preprocess($samples);
+
+/*
+$samples = [
+   [0.25, -0.25, 0.5],
+   [1.0, 0.0, 0.0],
+   [0.0, 0.5, -0.5],
+];
+*/
+
+```
--- a/docs/machine-learning/regression/svr.md
+++ b/docs/machine-learning/regression/svr.md
@ -0,0 +1,44 @@
+# Support Vector Regression
+
+Class implementing Epsilon-Support Vector Regression based on libsvm.
+
+### Constructor Parameters
+
+* $kernel (int) - kernel type to be used in the algorithm (default Kernel::LINEAR)
+* $degree (int) - degree of the Kernel::POLYNOMIAL function (default 3)
+* $epsilon (float) -  epsilon in loss function of epsilon-SVR (default 0.1)
+* $cost (float) - parameter C of epsilon-SVR (default 1.0)
+* $gamma (float) - kernel coefficient for ‘Kernel::RBF’, ‘Kernel::POLYNOMIAL’ and ‘Kernel::SIGMOID’. If gamma is ‘null’ then 1/features will be used instead.
+* $coef0 (float) - independent term in kernel function. It is only significant in ‘Kernel::POLYNOMIAL’ and ‘Kernel::SIGMOID’ (default 0.0)
+* $tolerance (float) - tolerance of termination criterion (default 0.001)
+* $cacheSize (int) - cache memory size in MB (default 100)
+* $shrinking (bool) - whether to use the shrinking heuristics (default true)
+
+```
+$regression = new SVR(Kernel::LINEAR);
+$regression = new SVR(Kernel::LINEAR, $degree = 3, $epsilon=10.0);
+```
+
+### Train
+
+To train a model simply provide train samples and target values (as `array`). Example:
+
+```
+use Phpml\Regression\SVR;
+use Phpml\SupportVectorMachine\Kernel;
+
+$samples = [[60], [61], [62], [63], [65]];
+$targets = [3.1, 3.6, 3.8, 4, 4.1];
+
+$regression = new SVR(Kernel::LINEAR);
+$regression->train($samples, $targets);
+```
+
+### Predict
+
+To predict sample target value use `predict` method. You can provide one sample or array of samples:
+
+```
+$regression->predict([64]);
+// return 4.03
+```
--- a/docs/math/statistic.md
+++ b/docs/math/statistic.md
@ -0,0 +1,80 @@
+# Statistic
+
+Selected statistical methods.
+
+## Correlation
+
+Correlation coefficients are used in statistics to measure how strong a relationship is between two variables. There are several types of correlation coefficient.
+
+### Pearson correlation
+ 
+Pearson’s correlation or Pearson correlation is a correlation coefficient commonly used in linear regression.
+
+Example:
+
+```
+use Phpml\Math\Statistic\Correlation;
+
+$x = [43, 21, 25, 42, 57, 59];
+$y = [99, 65, 79, 75, 87, 82];
+
+Correlation::pearson($x, $y);
+// return 0.549
+```
+
+## Mean
+
+### Arithmetic
+
+Example:
+
+```
+use Phpml\Math\Statistic\Mean;
+
+Mean::arithmetic([2, 5]);
+// return 3.5
+
+Mean::arithmetic([0.5, 0.5, 1.5, 2.5, 3.5]);
+// return 1.7
+```
+
+## Median
+
+Example:
+
+```
+use Phpml\Math\Statistic\Mean;
+
+Mean::median([5, 2, 6, 1, 3, 4]);
+// return 3.5
+
+Mean::median([5, 2, 6, 1, 3]);
+// return 3
+```
+
+## Mode
+
+Example:
+
+```
+use Phpml\Math\Statistic\Mean;
+
+Mean::mode([5, 2, 6, 1, 3, 4, 6, 6, 5]);
+// return 6
+```
+
+## Standard Deviation
+
+Example:
+
+```
+use Phpml\Math\Statistic\StandardDeviation;
+
+$population = [5, 6, 8, 9];
+StandardDeviation::population($population);
+// return 1.825
+
+$population = [7100, 15500, 4400, 4400, 5900, 4600, 8800, 2000, 2750, 2550,  960, 1025];
+StandardDeviation::population($population);
+// return 4079
+```
--- a/humbug.json.dist
+++ b/humbug.json.dist
@ -0,0 +1,11 @@
+{
+    "source": {
+        "directories": [
+            "src"
+        ]
+    },
+    "timeout": 10,
+    "logs": {
+        "text": "humbuglog.txt"
+    }
+}
--- a/mkdocs.yml
+++ b/mkdocs.yml
@ -1,17 +1,21 @@
-site_name: PHP Machine Learning (PHP-ML)
+site_name: PHP-ML - Machine Learning library for PHP
 pages:
  - Home: index.md
  - Machine Learning:
    - Classification:
+      - SVC: machine-learning/classification/svc.md
      - KNearestNeighbors: machine-learning/classification/k-nearest-neighbors.md
      - NaiveBayes: machine-learning/classification/naive-bayes.md
    - Regression:
      - LeastSquares: machine-learning/regression/least-squares.md
+      - SVR: machine-learning/regression/svr.md
    - Clustering:
      - KMeans: machine-learning/clustering/k-means.md
      - DBSCAN: machine-learning/clustering/dbscan.md
    - Cross Validation:
      - RandomSplit: machine-learning/cross-validation/random-split.md
+    - Feature Extraction:
+      - Token Count Vectorizer: machine-learning/feature-extraction/token-count-vectorizer.md
    - Datasets:
      - Array Dataset: machine-learning/datasets/array-dataset.md
      - CSV Dataset: machine-learning/datasets/csv-dataset.md
--- a/phpunit.xml
+++ b/phpunit.xml
@ -11,4 +11,11 @@
            <directory>tests/*</directory>
        </testsuite>
    </testsuites>
+
+    <filter>
+        <whitelist processUncoveredFilesFromWhitelist="true">
+            <directory suffix=".php">src</directory>
+        </whitelist>
+    </filter>
+
 </phpunit>
--- a/src/Phpml/Classification/KNearestNeighbors.php
+++ b/src/Phpml/Classification/KNearestNeighbors.php
@ -4,8 +4,8 @@ declare (strict_types = 1);

 namespace Phpml\Classification;

-use Phpml\Classification\Traits\Predictable;
-use Phpml\Classification\Traits\Trainable;
+use Phpml\Helper\Predictable;
+use Phpml\Helper\Trainable;
 use Phpml\Math\Distance;
 use Phpml\Math\Distance\Euclidean;

--- a/src/Phpml/Classification/NaiveBayes.php
+++ b/src/Phpml/Classification/NaiveBayes.php
@ -4,8 +4,8 @@ declare (strict_types = 1);

 namespace Phpml\Classification;

-use Phpml\Classification\Traits\Predictable;
-use Phpml\Classification\Traits\Trainable;
+use Phpml\Helper\Predictable;
+use Phpml\Helper\Trainable;

 class NaiveBayes implements Classifier
 {
--- a/src/Phpml/Classification/SVC.php
+++ b/src/Phpml/Classification/SVC.php
@ -0,0 +1,31 @@
+<?php
+
+declare (strict_types = 1);
+
+namespace Phpml\Classification;
+
+use Phpml\SupportVectorMachine\Kernel;
+use Phpml\SupportVectorMachine\SupportVectorMachine;
+use Phpml\SupportVectorMachine\Type;
+
+class SVC extends SupportVectorMachine implements Classifier
+{
+    /**
+     * @param int        $kernel               kernel type to be used in the algorithm (a Kernel::* constant, default Kernel::LINEAR)
+     * @param float      $cost                 parameter C of C-SVC
+     * @param int        $degree               degree of the Kernel::POLYNOMIAL function
+     * @param float|null $gamma                kernel coefficient for RBF, POLYNOMIAL and SIGMOID kernels; null is documented to mean 1/features
+     * @param float      $coef0                independent term in the kernel function (documented as significant for POLYNOMIAL and SIGMOID only)
+     * @param float      $tolerance            tolerance of the termination criterion
+     * @param int        $cacheSize            cache memory size in MB
+     * @param bool       $shrinking            whether to use the shrinking heuristics
+     * @param bool       $probabilityEstimates whether to enable probability estimates
+     */
+    public function __construct(
+        int $kernel = Kernel::LINEAR, float $cost = 1.0, int $degree = 3, float $gamma = null, float $coef0 = 0.0,
+        float $tolerance = 0.001, int $cacheSize = 100, bool $shrinking = true,
+        bool $probabilityEstimates = false
+    ) {
+        parent::__construct(Type::C_SVC, $kernel, $cost, 0.5, $degree, $gamma, $coef0, 0.1, $tolerance, $cacheSize, $shrinking, $probabilityEstimates); // 0.5 and 0.1 are fixed literals, presumably nu and epsilon (not exposed by SVC) - TODO confirm against the parent signature
+    }
+}
--- a/src/Phpml/Classification/SupportVectorMachine.php
+++ b/src/Phpml/Classification/SupportVectorMachine.php
@ -1,61 +0,0 @@
-<?php
-
-declare (strict_types = 1);
-
-namespace Phpml\Classification;
-
-use Phpml\Classification\Traits\Predictable;
-use Phpml\Classification\Traits\Trainable;
-use Phpml\Math\Kernel;
-
-class SupportVectorMachine implements Classifier
-{
-    use Trainable, Predictable;
-
-    /**
-     * @var Kernel
-     */
-    private $kernel;
-
-    /**
-     * @var float
-     */
-    private $C;
-
-    /**
-     * @var float
-     */
-    private $tolerance;
-
-    /**
-     * @var int
-     */
-    private $upperBound;
-
-    /**
-     * @param Kernel $kernel
-     * @param float  $C
-     * @param float  $tolerance
-     * @param int    $upperBound
-     */
-    public function __construct(Kernel $kernel = null, float $C = 1.0, float $tolerance = .001, int $upperBound = 100)
-    {
-        if (null === $kernel) {
-            $kernel = new Kernel\RBF($gamma = .001);
-        }
-
-        $this->kernel = $kernel;
-        $this->C = $C;
-        $this->tolerance = $tolerance;
-        $this->upperBound = $upperBound;
-    }
-
-    /**
-     * @param array $sample
-     *
-     * @return mixed
-     */
-    protected function predictSample(array $sample)
-    {
-    }
-}
--- a/src/Phpml/Clustering/KMeans/Point.php
+++ b/src/Phpml/Clustering/KMeans/Point.php
@ -49,7 +49,7 @@ class Point implements ArrayAccess
            $distance  += $difference * $difference;
        }

-        return $precise ? sqrt($distance) : $distance;
+        return $precise ? sqrt((float) $distance) : $distance;
    }

    /**
--- a/src/Phpml/Clustering/KMeans/Space.php
+++ b/src/Phpml/Clustering/KMeans/Space.php
@ -150,37 +150,11 @@ class Space extends SplObjectStorage
    {
        switch ($initMethod) {
            case KMeans::INIT_RANDOM:
-                list($min, $max) = $this->getBoundaries();
-                for ($n = 0; $n < $clustersNumber; ++$n) {
-                    $clusters[] = new Cluster($this, $this->getRandomPoint($min, $max)->getCoordinates());
-                }
+                $clusters = $this->initializeRandomClusters($clustersNumber);
                break;

            case KMeans::INIT_KMEANS_PLUS_PLUS:
-                $position = rand(1, count($this));
-                for ($i = 1, $this->rewind(); $i < $position && $this->valid(); $i++, $this->next());
-                $clusters[] = new Cluster($this, $this->current()->getCoordinates());
-
-                $distances = new SplObjectStorage();
-
-                for ($i = 1; $i < $clustersNumber; ++$i) {
-                    $sum = 0;
-                    foreach ($this as $point) {
-                        $distance = $point->getDistanceWith($point->getClosest($clusters));
-                        $sum += $distances[$point] = $distance;
-                    }
-
-                    $sum = rand(0, (int) $sum);
-                    foreach ($this as $point) {
-                        if (($sum -= $distances[$point]) > 0) {
-                            continue;
-                        }
-
-                        $clusters[] = new Cluster($this, $point->getCoordinates());
-                        break;
-                    }
-                }
-
+                $clusters = $this->initializeKMPPClusters($clustersNumber);
                break;
        }
        $clusters[0]->attachAll($this);
@ -230,4 +204,56 @@ class Space extends SplObjectStorage

        return $convergence;
    }
+
+    /**
+     * @param int $clustersNumber
+     *
+     * @return array
+     */
+    private function initializeRandomClusters(int $clustersNumber)
+    {
+        $clusters = [];
+        list($min, $max) = $this->getBoundaries();
+
+        for ($n = 0; $n < $clustersNumber; ++$n) {
+            $clusters[] = new Cluster($this, $this->getRandomPoint($min, $max)->getCoordinates());
+        }
+
+        return $clusters;
+    }
+
+    /**
+     * Builds the initial clusters for the KMeans::INIT_KMEANS_PLUS_PLUS method.
+     *
+     * The first cluster is placed on a randomly picked point of this space. Every
+     * further cluster is placed on a point drawn with a chance that grows with the
+     * point's distance to its closest already chosen cluster.
+     *
+     * NOTE(review): with an empty space current() has no point to return, so the
+     * first cluster cannot be created - confirm callers never get here without points.
+     *
+     * @param int $clustersNumber number of clusters to create
+     *
+     * @return array list of Cluster objects
+     */
+    protected function initializeKMPPClusters(int $clustersNumber)
+    {
+        $clusters = [];
+        // Pick a 1-based position and advance the storage iterator up to it.
+        $position = rand(1, count($this));
+        for ($i = 1, $this->rewind(); $i < $position && $this->valid(); $i++, $this->next());
+        $clusters[] = new Cluster($this, $this->current()->getCoordinates());
+
+        // Per-point distance to the closest cluster, refreshed for every new cluster.
+        $distances = new SplObjectStorage();
+
+        for ($i = 1; $i < $clustersNumber; ++$i) {
+            $sum = 0;
+            foreach ($this as $point) {
+                // NOTE(review): whether this is a plain or squared distance depends on
+                // the default arguments of getDistanceWith() - confirm.
+                $distance = $point->getDistanceWith($point->getClosest($clusters));
+                $sum += $distances[$point] = $distance;
+            }
+
+            // The total is truncated to an integer before drawing, so a total below 1
+            // always draws 0 and therefore selects the first point iterated.
+            $sum = rand(0, (int) $sum);
+            foreach ($this as $point) {
+                // Walk the points, consuming the drawn amount until it is used up.
+                if (($sum -= $distances[$point]) > 0) {
+                    continue;
+                }
+
+                $clusters[] = new Cluster($this, $point->getCoordinates());
+                break;
+            }
+        }
+
+        return $clusters;
+    }
 }
--- a/src/Phpml/Dataset/Dataset.php
+++ b/src/Phpml/Dataset/Dataset.php
@ -6,7 +6,6 @@ namespace Phpml\Dataset;

 interface Dataset
 {
-    const SOME = 'z';
    /**
     * @return array
     */
--- a/src/Phpml/Exception/NormalizerException.php
+++ b/src/Phpml/Exception/NormalizerException.php
@ -0,0 +1,16 @@
+<?php
+
+declare (strict_types = 1);
+
+namespace Phpml\Exception;
+
+class NormalizerException extends \Exception
+{
+    /**
+     * @return NormalizerException
+     */
+    public static function unknownNorm()
+    {
+        return new self('Unknown norm supplied.');
+    }
+}
--- a/src/Phpml/FeatureExtraction/TokenCountVectorizer.php
+++ b/src/Phpml/FeatureExtraction/TokenCountVectorizer.php
@ -0,0 +1,163 @@
+<?php
+
+declare (strict_types = 1);
+
+namespace Phpml\FeatureExtraction;
+
+use Phpml\Tokenization\Tokenizer;
+
+class TokenCountVectorizer implements Vectorizer
+{
+    /**
+     * @var Tokenizer
+     */
+    private $tokenizer;
+
+    /**
+     * @var float
+     */
+    private $minDF;
+
+    /**
+     * @var array
+     */
+    private $vocabulary;
+
+    /**
+     * @var array
+     */
+    private $frequencies;
+
+    /**
+     * @param Tokenizer $tokenizer
+     * @param float     $minDF
+     */
+    public function __construct(Tokenizer $tokenizer, float $minDF = 0)
+    {
+        $this->tokenizer = $tokenizer;
+        $this->minDF = $minDF;
+        $this->vocabulary = [];
+        $this->frequencies = [];
+    }
+
+    /**
+     * @param array $samples
+     *
+     * @return array
+     */
+    public function transform(array $samples): array
+    {
+        foreach ($samples as $index => $sample) {
+            $samples[$index] = $this->transformSample($sample);
+        }
+
+        $samples = $this->checkDocumentFrequency($samples);
+
+        return $samples;
+    }
+
+    /**
+     * @return array
+     */
+    public function getVocabulary()
+    {
+        return array_flip($this->vocabulary);
+    }
+
+    /**
+     * Tokenizes one sample and counts how often each vocabulary index occurs in it.
+     *
+     * Unknown tokens are added to the vocabulary as a side effect. The frequency
+     * table used by the minimum document frequency check grows once per sample for
+     * every distinct token, so repeating a token inside a single sample does not
+     * inflate its document frequency.
+     *
+     * @param string $sample
+     *
+     * @return array token counts keyed by vocabulary index
+     */
+    private function transformSample(string $sample)
+    {
+        $counts = [];
+        $tokens = $this->tokenizer->tokenize($sample);
+        foreach ($tokens as $token) {
+            $index = $this->getTokenIndex($token);
+            if (!isset($counts[$index])) {
+                // First occurrence of the token in this sample: the only place
+                // where its document frequency is incremented.
+                $this->updateFrequency($token);
+                $counts[$index] = 0;
+            }
+
+            ++$counts[$index];
+        }
+
+        return $counts;
+    }
+
+    /**
+     * @param string $token
+     *
+     * @return mixed
+     */
+    private function getTokenIndex(string $token)
+    {
+        if (!isset($this->vocabulary[$token])) {
+            $this->vocabulary[$token] = count($this->vocabulary);
+        }
+
+        return $this->vocabulary[$token];
+    }
+
+    /**
+     * @param string $token
+     */
+    private function updateFrequency(string $token)
+    {
+        if (!isset($this->frequencies[$token])) {
+            $this->frequencies[$token] = 0;
+        }
+
+        ++$this->frequencies[$token];
+    }
+
+    /**
+     * @param array $samples
+     * 
+     * @return array
+     */
+    private function checkDocumentFrequency(array $samples)
+    {
+        if ($this->minDF > 0) {
+            $beyondMinimum = $this->getBeyondMinimumIndexes(count($samples));
+            foreach ($samples as $index => $sample) {
+                $samples[$index] = $this->unsetBeyondMinimum($sample, $beyondMinimum);
+            }
+        }
+
+        return $samples;
+    }
+
+    /**
+     * Removes the listed vocabulary indexes from one transformed sample.
+     *
+     * @param array $sample        token counts keyed by vocabulary index
+     * @param array $beyondMinimum vocabulary indexes to drop
+     *
+     * @return array the sample without the listed indexes, remaining keys untouched
+     */
+    private function unsetBeyondMinimum(array $sample, array $beyondMinimum)
+    {
+        // Turn the index list into keys so both arrays can be compared key-wise.
+        $rejectedKeys = array_flip($beyondMinimum);
+
+        return array_diff_key($sample, $rejectedKeys);
+    }
+
+    /**
+     * Collects the vocabulary indexes of all tokens whose recorded frequency,
+     * divided by the number of samples, is lower than the configured minDF.
+     *
+     * @param int $samplesCount number of samples the frequencies are divided by
+     *
+     * @return array vocabulary indexes that have to be removed from every sample
+     */
+    private function getBeyondMinimumIndexes(int $samplesCount)
+    {
+        $indexes = [];
+        if ($samplesCount < 1) {
+            // Without samples there is nothing to filter and the ratio below
+            // would divide by zero.
+            return $indexes;
+        }
+
+        foreach ($this->frequencies as $token => $frequency) {
+            if (($frequency / $samplesCount) < $this->minDF) {
+                // PHP stores numeric-string keys such as '2016' as integers; the
+                // string type hint of getTokenIndex() rejects those under
+                // strict_types, so the key is cast back to a string first.
+                $indexes[] = $this->getTokenIndex((string) $token);
+            }
+        }
+
+        return $indexes;
+    }
+}
--- a/src/Phpml/FeatureExtraction/Vectorizer.php
+++ b/src/Phpml/FeatureExtraction/Vectorizer.php
@ -0,0 +1,15 @@
+<?php
+
+declare (strict_types = 1);
+
+namespace Phpml\FeatureExtraction;
+
+interface Vectorizer
+{
+    /**
+     * @param array $samples
+     *
+     * @return array
+     */
+    public function transform(array $samples): array;
+}
--- a/src/Phpml/Classification/Traits/Predictable.php
+++ b/src/Phpml/Classification/Traits/Predictable.php
@ -2,7 +2,7 @@

 declare (strict_types = 1);

-namespace Phpml\Classification\Traits;
+namespace Phpml\Helper;

 trait Predictable
 {
--- a/src/Phpml/Classification/Traits/Trainable.php
+++ b/src/Phpml/Classification/Traits/Trainable.php
@ -2,7 +2,7 @@

 declare (strict_types = 1);

-namespace Phpml\Classification\Traits;
+namespace Phpml\Helper;

 trait Trainable
 {
--- a/src/Phpml/Math/Distance/Euclidean.php
+++ b/src/Phpml/Math/Distance/Euclidean.php
@ -30,6 +30,6 @@ class Euclidean implements Distance
            $distance += pow($a[$i] - $b[$i], 2);
        }

-        return sqrt($distance);
+        return sqrt((float) $distance);
    }
 }
--- a/src/Phpml/Math/Matrix.php
+++ b/src/Phpml/Math/Matrix.php
@ -147,7 +147,7 @@ class Matrix
            for ($j = 0; $j < $this->columns; ++$j) {
                $subMatrix = $this->crossOut(0, $j);
                $minor = $this->matrix[0][$j] * $subMatrix->getDeterminant();
-                $determinant += fmod($j, 2) == 0 ? $minor : -$minor;
+                $determinant += fmod((float) $j, 2.0) == 0 ? $minor : -$minor;
            }
        }

@ -236,7 +236,7 @@ class Matrix
        for ($i = 0; $i < $this->rows; ++$i) {
            for ($j = 0; $j < $this->columns; ++$j) {
                $minor = $this->crossOut($i, $j)->getDeterminant();
-                $newMatrix[$i][$j] = fmod($i + $j, 2) == 0 ? $minor : -$minor;
+                $newMatrix[$i][$j] = fmod((float) ($i + $j), 2.0) == 0 ? $minor : -$minor;
            }
        }

--- a/src/Phpml/Math/Statistic/Correlation.php
+++ b/src/Phpml/Math/Statistic/Correlation.php
@ -38,7 +38,7 @@ class Correlation
            $b2 = $b2 + pow($b, 2);
        }

-        $corr = $axb / sqrt($a2 * $b2);
+        $corr = $axb / sqrt((float) ($a2 * $b2));

        return $corr;
    }
--- a/src/Phpml/Math/Statistic/Mean.php
+++ b/src/Phpml/Math/Statistic/Mean.php
@ -4,15 +4,72 @@ declare (strict_types = 1);

 namespace Phpml\Math\Statistic;

+use Phpml\Exception\InvalidArgumentException;
+
 class Mean
 {
    /**
-     * @param array $a
+     * @param array $numbers
     *
     * @return float
+     *
+     * @throws InvalidArgumentException
     */
-    public static function arithmetic(array $a)
+    public static function arithmetic(array $numbers)
    {
-        return array_sum($a) / count($a);
+        self::checkArrayLength($numbers);
+
+        return array_sum($numbers) / count($numbers);
+    }
+
+    /**
+     * Returns the median of the given numbers.
+     *
+     * @param array $numbers
+     *
+     * @return float|mixed the middle element for an odd count, otherwise the
+     *                     average of the two middle elements
+     *
+     * @throws InvalidArgumentException when $numbers is empty
+     */
+    public static function median(array $numbers)
+    {
+        self::checkArrayLength($numbers);
+
+        sort($numbers, SORT_NUMERIC);
+        $total = count($numbers);
+        $upperMiddle = intdiv($total, 2);
+
+        if (0 != $total % 2) {
+            return $numbers[$upperMiddle];
+        }
+
+        return ($numbers[$upperMiddle] + $numbers[$upperMiddle - 1]) / 2;
+    }
+
+    /**
+     * Returns the most frequent value; on a tie the value seen first wins.
+     *
+     * Integers and strings are tallied exactly as array_count_values() would.
+     * Floats, which that function skips with a warning, are tallied as well,
+     * keyed by their string form. Values of any other type are ignored.
+     *
+     * @param array $numbers
+     *
+     * @return mixed the winning value; a float is returned as a float, any other
+     *               value as the (integer or string) array key it was tallied under
+     *
+     * @throws InvalidArgumentException when $numbers is empty
+     */
+    public static function mode(array $numbers)
+    {
+        self::checkArrayLength($numbers);
+
+        $counts = [];
+        $floats = [];
+        foreach ($numbers as $number) {
+            if (!is_int($number) && !is_string($number) && !is_float($number)) {
+                continue;
+            }
+
+            $key = (string) $number;
+            if (!isset($counts[$key])) {
+                $counts[$key] = 0;
+                if (is_float($number)) {
+                    // Remember the original float so it is not returned as a key.
+                    $floats[$key] = $number;
+                }
+            }
+
+            ++$counts[$key];
+        }
+
+        $mode = array_search(max($counts), $counts);
+
+        return isset($floats[$mode]) ? $floats[$mode] : $mode;
+    }
+
+    /**
+     * @param array $array
+     *
+     * @throws InvalidArgumentException
+     */
+    private static function checkArrayLength(array $array)
+    {
+        if (0 == count($array)) {
+            throw InvalidArgumentException::arrayCantBeEmpty();
+        }
    }
 }
--- a/src/Phpml/Math/Statistic/StandardDeviation.php
+++ b/src/Phpml/Math/Statistic/StandardDeviation.php
@ -39,6 +39,6 @@ class StandardDeviation
            --$n;
        }

-        return sqrt($carry / $n);
+        return sqrt((float) ($carry / $n));
    }
 }
--- a/src/Phpml/Preprocessing/Imputer.php
+++ b/src/Phpml/Preprocessing/Imputer.php
@ -0,0 +1,86 @@
+<?php
+
+declare (strict_types = 1);
+
+namespace Phpml\Preprocessing;
+
+use Phpml\Preprocessing\Imputer\Strategy;
+
+class Imputer implements Preprocessor
+{
+    const AXIS_COLUMN = 0;
+    const AXIS_ROW = 1;
+
+    /**
+     * @var mixed
+     */
+    private $missingValue;
+
+    /**
+     * @var Strategy
+     */
+    private $strategy;
+
+    /**
+     * @var int
+     */
+    private $axis;
+
+    /**
+     * @param mixed    $missingValue
+     * @param Strategy $strategy
+     * @param int      $axis
+     */
+    public function __construct($missingValue = null, Strategy $strategy, int $axis = self::AXIS_COLUMN)
+    {
+        $this->missingValue = $missingValue;
+        $this->strategy = $strategy;
+        $this->axis = $axis;
+    }
+
+    /**
+     * @param array $samples
+     */
+    public function preprocess(array &$samples)
+    {
+        foreach ($samples as &$sample) {
+            $this->preprocessSample($sample, $samples);
+        }
+    }
+
+    /**
+     * @param array $sample
+     * @param array $samples
+     */
+    private function preprocessSample(array &$sample, array $samples)
+    {
+        foreach ($sample as $column => &$value) {
+            if ($value === $this->missingValue) {
+                $value = $this->strategy->replaceValue($this->getAxis($column, $sample, $samples));
+            }
+        }
+    }
+
+    /**
+     * Collects the known values the strategy derives a replacement from.
+     *
+     * For AXIS_ROW these are the non-missing values of the current sample (keys
+     * preserved); otherwise the non-missing values found in the given column of
+     * every sample. Both branches compare strictly against the missing value
+     * marker, which is the same test preprocessSample() uses to decide what gets
+     * replaced, so a value that is not replaced is never left out of the statistic.
+     *
+     * @param int   $column        column of the value being replaced
+     * @param array $currentSample sample that contains the value being replaced
+     * @param array $samples       all samples
+     *
+     * @return array
+     */
+    private function getAxis(int $column, array $currentSample, array $samples): array
+    {
+        $missingValue = $this->missingValue;
+
+        if (self::AXIS_ROW === $this->axis) {
+            return array_filter($currentSample, function ($value) use ($missingValue) {
+                return $value !== $missingValue;
+            });
+        }
+
+        $axis = [];
+        foreach ($samples as $sample) {
+            if ($sample[$column] !== $missingValue) {
+                $axis[] = $sample[$column];
+            }
+        }
+
+        return $axis;
+    }
+}
--- a/src/Phpml/Preprocessing/Imputer/Strategy.php
+++ b/src/Phpml/Preprocessing/Imputer/Strategy.php
@ -0,0 +1,15 @@
+<?php
+
+declare (strict_types = 1);
+
+namespace Phpml\Preprocessing\Imputer;
+
+interface Strategy
+{
+    /**
+     * @param array $currentAxis
+     *
+     * @return mixed
+     */
+    public function replaceValue(array $currentAxis);
+}
--- a/src/Phpml/Preprocessing/Imputer/Strategy/MeanStrategy.php
+++ b/src/Phpml/Preprocessing/Imputer/Strategy/MeanStrategy.php
@ -0,0 +1,21 @@
+<?php
+
+declare (strict_types = 1);
+
+namespace Phpml\Preprocessing\Imputer\Strategy;
+
+use Phpml\Preprocessing\Imputer\Strategy;
+use Phpml\Math\Statistic\Mean;
+
+class MeanStrategy implements Strategy
+{
+    /**
+     * @param array $currentAxis
+     * 
+     * @return float
+     */
+    public function replaceValue(array $currentAxis)
+    {
+        return Mean::arithmetic($currentAxis);
+    }
+}
--- a/src/Phpml/Preprocessing/Imputer/Strategy/MedianStrategy.php
+++ b/src/Phpml/Preprocessing/Imputer/Strategy/MedianStrategy.php
@ -0,0 +1,21 @@
+<?php
+
+declare (strict_types = 1);
+
+namespace Phpml\Preprocessing\Imputer\Strategy;
+
+use Phpml\Preprocessing\Imputer\Strategy;
+use Phpml\Math\Statistic\Mean;
+
+class MedianStrategy implements Strategy
+{
+    /**
+     * @param array $currentAxis
+     * 
+     * @return float
+     */
+    public function replaceValue(array $currentAxis)
+    {
+        return Mean::median($currentAxis);
+    }
+}
--- a/src/Phpml/Preprocessing/Imputer/Strategy/MostFrequentStrategy.php
+++ b/src/Phpml/Preprocessing/Imputer/Strategy/MostFrequentStrategy.php
@ -0,0 +1,21 @@
+<?php
+
+declare (strict_types = 1);
+
+namespace Phpml\Preprocessing\Imputer\Strategy;
+
+use Phpml\Preprocessing\Imputer\Strategy;
+use Phpml\Math\Statistic\Mean;
+
+class MostFrequentStrategy implements Strategy
+{
+    /**
+     * @param array $currentAxis
+     * 
+     * @return float|mixed
+     */
+    public function replaceValue(array $currentAxis)
+    {
+        return Mean::mode($currentAxis);
+    }
+}
--- a/src/Phpml/Preprocessing/Normalizer.php
+++ b/src/Phpml/Preprocessing/Normalizer.php
@ -0,0 +1,83 @@
+<?php
+
+declare (strict_types = 1);
+
+namespace Phpml\Preprocessing;
+
+use Phpml\Exception\NormalizerException;
+
+class Normalizer implements Preprocessor
+{
+    const NORM_L1 = 1;
+    const NORM_L2 = 2;
+
+    /**
+     * @var int
+     */
+    private $norm;
+
+    /**
+     * @param int $norm
+     *
+     * @throws NormalizerException
+     */
+    public function __construct(int $norm = self::NORM_L2)
+    {
+        if (!in_array($norm, [self::NORM_L1, self::NORM_L2])) {
+            throw NormalizerException::unknownNorm();
+        }
+
+        $this->norm = $norm;
+    }
+
+    /**
+     * @param array $samples
+     */
+    public function preprocess(array &$samples)
+    {
+        $method = sprintf('normalizeL%s', $this->norm);
+        foreach ($samples as &$sample) {
+            $this->$method($sample);
+        }
+    }
+
+    /**
+     * Scales a sample in place so that the absolute values of its features sum to 1.
+     *
+     * A sample whose features are all zero becomes a uniform vector of 1/n values.
+     * An empty sample is left untouched, which avoids dividing by a feature count
+     * of zero.
+     *
+     * @param array $sample
+     */
+    private function normalizeL1(array &$sample)
+    {
+        if ([] === $sample) {
+            return;
+        }
+
+        $norm1 = 0;
+        foreach ($sample as $feature) {
+            $norm1 += abs($feature);
+        }
+
+        if (0 == $norm1) {
+            $count = count($sample);
+            $sample = array_fill(0, $count, 1.0 / $count);
+        } else {
+            foreach ($sample as &$feature) {
+                $feature = $feature / $norm1;
+            }
+            // Drop the reference so a later write to $feature cannot alter the sample.
+            unset($feature);
+        }
+    }
+
+    /**
+     * Scales a sample in place to a Euclidean length of 1.
+     *
+     * A sample whose length is zero is replaced by a list of ones of the same size.
+     *
+     * @param array $sample
+     */
+    private function normalizeL2(array &$sample)
+    {
+        $squares = 0;
+        foreach ($sample as $value) {
+            $squares += $value * $value;
+        }
+        $length = sqrt($squares);
+
+        if (0 != $length) {
+            foreach ($sample as $key => $value) {
+                $sample[$key] = $value / $length;
+            }
+
+            return;
+        }
+
+        $sample = array_fill(0, count($sample), 1);
+    }
+}
--- a/src/Phpml/Preprocessing/Preprocessor.php
+++ b/src/Phpml/Preprocessing/Preprocessor.php
@ -0,0 +1,13 @@
+<?php
+
+declare (strict_types = 1);
+
+namespace Phpml\Preprocessing;
+
+interface Preprocessor
+{
+    /**
+     * @param array $samples
+     */
+    public function preprocess(array &$samples);
+}
--- a/src/Phpml/Regression/LeastSquares.php
+++ b/src/Phpml/Regression/LeastSquares.php
@ -4,10 +4,12 @@ declare (strict_types = 1);

 namespace Phpml\Regression;

+use Phpml\Helper\Predictable;
 use Phpml\Math\Matrix;

 class LeastSquares implements Regression
 {
+    use Predictable;
    /**
     * @var array
     */
@ -45,7 +47,7 @@ class LeastSquares implements Regression
     *
     * @return mixed
     */
-    public function predict($sample)
+    public function predictSample(array $sample)
    {
        $result = $this->intercept;
        foreach ($this->coefficients as $index => $coefficient) {
--- a/src/Phpml/Regression/Regression.php
+++ b/src/Phpml/Regression/Regression.php
@ -13,9 +13,9 @@ interface Regression
    public function train(array $samples, array $targets);

    /**
-     * @param float $sample
+     * @param array $samples
     *
     * @return mixed
     */
-    public function predict($sample);
+    public function predict(array $samples);
 }
--- a/src/Phpml/Regression/SVR.php
+++ b/src/Phpml/Regression/SVR.php
@ -0,0 +1,31 @@
+<?php
+
+declare (strict_types = 1);
+
+namespace Phpml\Regression;
+
+use Phpml\SupportVectorMachine\Kernel;
+use Phpml\SupportVectorMachine\SupportVectorMachine;
+use Phpml\SupportVectorMachine\Type;
+
+class SVR extends SupportVectorMachine implements Regression
+{
+    /**
+     * @param int        $kernel
+     * @param int        $degree
+     * @param float      $epsilon
+     * @param float      $cost
+     * @param float|null $gamma
+     * @param float      $coef0
+     * @param float      $tolerance
+     * @param int        $cacheSize
+     * @param bool       $shrinking
+     */
+    public function __construct(
+        int $kernel = Kernel::RBF, int $degree = 3, float $epsilon = 0.1, float $cost = 1.0,
+        float $gamma = null, float $coef0 = 0.0,  float $tolerance = 0.001,
+        int $cacheSize = 100, bool $shrinking = true
+    ) {
+        parent::__construct(Type::EPSILON_SVR, $kernel, $cost, 0.5, $degree, $gamma, $coef0, $epsilon, $tolerance, $cacheSize, $shrinking, false);
+    }
+}
--- a/src/Phpml/SupportVectorMachine/DataTransformer.php
+++ b/src/Phpml/SupportVectorMachine/DataTransformer.php
@ -0,0 +1,101 @@
+<?php
+
+declare (strict_types = 1);
+
+namespace Phpml\SupportVectorMachine;
+
+class DataTransformer
+{
+    /**
+     * @param array $samples
+     * @param array $labels
+     * @param bool  $targets
+     *
+     * @return string
+     */
+    public static function trainingSet(array $samples, array $labels, bool $targets = false): string
+    {
+        $set = '';
+        if (!$targets) {
+            $numericLabels = self::numericLabels($labels);
+        }
+
+        foreach ($labels as $index => $label) {
+            $set .= sprintf('%s %s %s', ($targets ? $label : $numericLabels[$label]), self::sampleRow($samples[$index]), PHP_EOL);
+        }
+
+        return $set;
+    }
+
+    /**
+     * @param array $samples
+     *
+     * @return string
+     */
+    public static function testSet(array $samples): string
+    {
+        if (!is_array($samples[0])) {
+            $samples = [$samples];
+        }
+
+        $set = '';
+        foreach ($samples as $sample) {
+            $set .= sprintf('0 %s %s', self::sampleRow($sample), PHP_EOL);
+        }
+
+        return $set;
+    }
+
+    /**
+     * Maps the raw output of a prediction run back to the original labels.
+     *
+     * Every non-empty line of the output is looked up among the numeric ids
+     * generated by numericLabels() and replaced by the label owning that id.
+     *
+     * @param string $rawPredictions output text, one predicted numeric id per line
+     * @param array  $labels         labels the numeric ids were generated from
+     *
+     * @return array predicted labels in output order (false where no id matched)
+     */
+    public static function predictions(string $rawPredictions, array $labels): array
+    {
+        $numericLabels = self::numericLabels($labels);
+        $lines = explode(PHP_EOL, $rawPredictions);
+
+        $results = [];
+        foreach ($lines as $line) {
+            if ('' === $line) {
+                continue;
+            }
+
+            $results[] = array_search($line, $numericLabels);
+        }
+
+        return $results;
+    }
+
+    /**
+     * Assigns consecutive numeric ids, starting at 0, to the distinct labels in
+     * order of first appearance.
+     *
+     * @param array $labels
+     *
+     * @return array numeric ids keyed by label
+     */
+    public static function numericLabels(array $labels): array
+    {
+        $idByLabel = [];
+        foreach ($labels as $label) {
+            if (!isset($idByLabel[$label])) {
+                // The next free id equals the number of labels registered so far.
+                $idByLabel[$label] = count($idByLabel);
+            }
+        }
+
+        return $idByLabel;
+    }
+
+    /**
+     * Formats one sample as space separated "index:value" pairs, where the index
+     * is the feature's array key plus one.
+     *
+     * @param array $sample
+     *
+     * @return string
+     */
+    private static function sampleRow(array $sample): string
+    {
+        $pairs = array_map(
+            function ($position, $value) {
+                return sprintf('%s:%s', $position + 1, $value);
+            },
+            array_keys($sample),
+            $sample
+        );
+
+        return implode(' ', $pairs);
+    }
+}
--- a/src/Phpml/SupportVectorMachine/Kernel.php
+++ b/src/Phpml/SupportVectorMachine/Kernel.php
@ -0,0 +1,28 @@
+<?php
+
+declare (strict_types = 1);
+
+namespace Phpml\SupportVectorMachine;
+
+abstract class Kernel
+{
+    /**
+     * u'*v.
+     */
+    const LINEAR = 0;
+
+    /**
+     * (gamma*u'*v + coef0)^degree.
+     */
+    const POLYNOMIAL = 1;
+
+    /**
+     * exp(-gamma*|u-v|^2).
+     */
+    const RBF = 2;
+
+    /**
+     * tanh(gamma*u'*v + coef0).
+     */
+    const SIGMOID = 3;
+}
--- a/src/Phpml/SupportVectorMachine/SupportVectorMachine.php
+++ b/src/Phpml/SupportVectorMachine/SupportVectorMachine.php
@ -0,0 +1,230 @@
+<?php
+
+declare (strict_types = 1);
+
+namespace Phpml\SupportVectorMachine;
+
+class SupportVectorMachine
+{
+    /**
+     * @var int
+     */
+    private $type;
+
+    /**
+     * @var int
+     */
+    private $kernel;
+
+    /**
+     * @var float
+     */
+    private $cost;
+
+    /**
+     * @var float
+     */
+    private $nu;
+
+    /**
+     * @var int
+     */
+    private $degree;
+
+    /**
+     * @var float
+     */
+    private $gamma;
+
+    /**
+     * @var float
+     */
+    private $coef0;
+
+    /**
+     * @var float
+     */
+    private $epsilon;
+
+    /**
+     * @var float
+     */
+    private $tolerance;
+
+    /**
+     * @var int
+     */
+    private $cacheSize;
+
+    /**
+     * @var bool
+     */
+    private $shrinking;
+
+    /**
+     * @var bool
+     */
+    private $probabilityEstimates;
+
+    /**
+     * @var string
+     */
+    private $binPath;
+
+    /**
+     * @var string
+     */
+    private $varPath;
+
+    /**
+     * @var string
+     */
+    private $model;
+
+    /**
+     * @var array
+     */
+    private $labels;
+
+    /**
+     * @param int        $type
+     * @param int        $kernel
+     * @param float      $cost
+     * @param float      $nu
+     * @param int        $degree
+     * @param float|null $gamma
+     * @param float      $coef0
+     * @param float      $epsilon
+     * @param float      $tolerance
+     * @param int        $cacheSize
+     * @param bool       $shrinking
+     * @param bool       $probabilityEstimates
+     */
+    public function __construct(
+        int $type, int $kernel, float $cost = 1.0, float $nu = 0.5, int $degree = 3,
+        float $gamma = null, float $coef0 = 0.0, float $epsilon = 0.1, float $tolerance = 0.001,
+        int $cacheSize = 100, bool $shrinking = true, bool $probabilityEstimates = false
+    ) {
+        $this->type = $type;
+        $this->kernel = $kernel;
+        $this->cost = $cost;
+        $this->nu = $nu;
+        $this->degree = $degree;
+        $this->gamma = $gamma;
+        $this->coef0 = $coef0;
+        $this->epsilon = $epsilon;
+        $this->tolerance = $tolerance;
+        $this->cacheSize = $cacheSize;
+        $this->shrinking = $shrinking;
+        $this->probabilityEstimates = $probabilityEstimates;
+
+        $rootPath = realpath(implode(DIRECTORY_SEPARATOR, [dirname(__FILE__), '..', '..', '..'])).DIRECTORY_SEPARATOR;
+
+        $this->binPath = $rootPath.'bin'.DIRECTORY_SEPARATOR.'libsvm'.DIRECTORY_SEPARATOR;
+        $this->varPath = $rootPath.'var'.DIRECTORY_SEPARATOR;
+    }
+
+    /**
+     * Trains a model by running the svm-train binary found under binPath and
+     * keeps the resulting model text in memory.
+     *
+     * The training set is written to a uniquely named file under varPath, the
+     * command is expected to create "<that file>-model", its content is stored in
+     * $this->model, and both files are deleted afterwards.
+     *
+     * @param array $samples
+     * @param array $labels
+     */
+    public function train(array $samples, array $labels)
+    {
+        // Kept so predict() can translate numeric class ids back to labels.
+        $this->labels = $labels;
+        // For the two regression types the labels are written as they are (targets);
+        // for every other type they are replaced by numeric class ids.
+        $trainingSet = DataTransformer::trainingSet($samples, $labels, in_array($this->type, [Type::EPSILON_SVR, Type::NU_SVR]));
+        file_put_contents($trainingSetFileName = $this->varPath.uniqid(), $trainingSet);
+        $modelFileName = $trainingSetFileName.'-model';
+
+        $command = $this->buildTrainCommand($trainingSetFileName, $modelFileName);
+        $output = '';
+        exec(escapeshellcmd($command), $output);
+
+        // NOTE(review): neither the exit status nor $output is inspected; if the
+        // command produced no model file this read and the unlink below fail -
+        // confirm whether that should be reported explicitly.
+        $this->model = file_get_contents($modelFileName);
+
+        unlink($trainingSetFileName);
+        unlink($modelFileName);
+    }
+
+    /**
+     * @return string
+     */
+    public function getModel()
+    {
+        return $this->model;
+    }
+
+    /**
+     * Predicts labels or values by running the svm-predict binary found under
+     * binPath against the model stored by train().
+     *
+     * The test set and the model are written to temporary files under varPath,
+     * the command is expected to write its result to "<test set file>-output",
+     * and all three files are deleted afterwards.
+     *
+     * @param array $samples a list of samples, or one flat sample
+     *
+     * @return array|mixed list of predictions; a single prediction when one flat
+     *                     sample was passed
+     */
+    public function predict(array $samples)
+    {
+        $testSet = DataTransformer::testSet($samples);
+        file_put_contents($testSetFileName = $this->varPath.uniqid(), $testSet);
+        file_put_contents($modelFileName = $testSetFileName.'-model', $this->model);
+        $outputFileName = $testSetFileName.'-output';
+
+        // NOTE(review): unlike the train command, the file names are not quoted
+        // here - presumably a varPath containing spaces breaks this call; confirm.
+        $command = sprintf('%ssvm-predict%s %s %s %s', $this->binPath, $this->getOSExtension(), $testSetFileName, $modelFileName, $outputFileName);
+        $output = '';
+        exec(escapeshellcmd($command), $output);
+
+        // NOTE(review): the exit status is not inspected; a missing output file
+        // makes this read return false - confirm the intended failure handling.
+        $predictions = file_get_contents($outputFileName);
+
+        unlink($testSetFileName);
+        unlink($modelFileName);
+        unlink($outputFileName);
+
+        if (in_array($this->type, [Type::C_SVC, Type::NU_SVC])) {
+            // Classification: translate numeric class ids back to the training labels.
+            $predictions = DataTransformer::predictions($predictions, $this->labels);
+        } else {
+            // Other types: return the output lines as they are (strings).
+            $predictions = explode(PHP_EOL, trim($predictions));
+        }
+
+        // A flat sample was wrapped into a one element set, so unwrap its result.
+        if (!is_array($samples[0])) {
+            return $predictions[0];
+        }
+
+        return $predictions;
+    }
+
+    /**
+     * @return string
+     */
+    private function getOSExtension()
+    {
+        if (strtoupper(substr(PHP_OS, 0, 3)) === 'WIN') {
+            return '.exe';
+        }
+
+        return '';
+    }
+
+    /**
+     * Builds the svm-train command line from the configured parameters.
+     *
+     * Option mapping: -s type, -t kernel, -c cost, -n nu, -d degree, -g gamma
+     * (only added when gamma is not null), -r coef0, -p epsilon, -m cacheSize,
+     * -e tolerance, -h shrinking (0/1), -b probabilityEstimates (0/1). The two
+     * file names are appended wrapped in single quotes.
+     *
+     * @param string $trainingSetFileName file holding the training set
+     * @param string $modelFileName       file the model is to be written to
+     *
+     * @return string
+     */
+    private function buildTrainCommand(string $trainingSetFileName, string $modelFileName): string
+    {
+        // The arguments below are positional: their order must match the
+        // placeholders of the format string.
+        return sprintf('%ssvm-train%s -s %s -t %s -c %s -n %s -d %s%s -r %s -p %s -m %s -e %s -h %d -b %d \'%s\' \'%s\'',
+            $this->binPath,
+            $this->getOSExtension(),
+            $this->type,
+            $this->kernel,
+            $this->cost,
+            $this->nu,
+            $this->degree,
+            $this->gamma !== null ? ' -g '.$this->gamma : '',
+            $this->coef0,
+            $this->epsilon,
+            $this->cacheSize,
+            $this->tolerance,
+            $this->shrinking,
+            $this->probabilityEstimates,
+            $trainingSetFileName,
+            $modelFileName
+        );
+    }
+}
--- a/src/Phpml/SupportVectorMachine/Type.php
+++ b/src/Phpml/SupportVectorMachine/Type.php
@ -0,0 +1,33 @@
+<?php
+
+declare (strict_types = 1);
+
+namespace Phpml\SupportVectorMachine;
+
+abstract class Type
+{
+    /**
+     * classification.
+     */
+    const C_SVC = 0;
+
+    /**
+     * classification.
+     */
+    const NU_SVC = 1;
+
+    /**
+     * distribution estimation.
+     */
+    const ONE_CLASS_SVM = 2;
+
+    /**
+     * regression.
+     */
+    const EPSILON_SVR = 3;
+
+    /**
+     * regression.
+     */
+    const NU_SVR = 4;
+}
--- a/src/Phpml/Tokenization/Tokenizer.php
+++ b/src/Phpml/Tokenization/Tokenizer.php
@ -0,0 +1,15 @@
+<?php
+
+declare (strict_types = 1);
+
+namespace Phpml\Tokenization;
+
+interface Tokenizer
+{
+    /**
+     * @param string $text
+     * 
+     * @return array
+     */
+    public function tokenize(string $text): array;
+}
--- a/src/Phpml/Tokenization/WhitespaceTokenizer.php
+++ b/src/Phpml/Tokenization/WhitespaceTokenizer.php
@ -0,0 +1,18 @@
+<?php
+
+declare (strict_types = 1);
+
+namespace Phpml\Tokenization;
+
+class WhitespaceTokenizer implements Tokenizer
+{
+    /**
+     * @param string $text
+     *
+     * @return array
+     */
+    public function tokenize(string $text): array
+    {
+        return preg_split('/[\pZ\pC]+/u', $text, -1, PREG_SPLIT_NO_EMPTY);
+    }
+}
--- a/src/Phpml/Tokenization/WordTokenizer.php
+++ b/src/Phpml/Tokenization/WordTokenizer.php
@ -0,0 +1,21 @@
+<?php
+
+declare (strict_types = 1);
+
+namespace Phpml\Tokenization;
+
+class WordTokenizer implements Tokenizer
+{
+    /**
+     * @param string $text
+     *
+     * @return array
+     */
+    public function tokenize(string $text): array
+    {
+        $tokens = [];
+        preg_match_all('/\w\w+/u', $text, $tokens);
+
+        return $tokens[0];
+    }
+}
--- a/tests/Phpml/Classification/SVCTest.php
+++ b/tests/Phpml/Classification/SVCTest.php
@ -0,0 +1,45 @@
+<?php
+
+declare (strict_types = 1);
+
+namespace tests\Classification;
+
+use Phpml\Classification\SVC;
+use Phpml\SupportVectorMachine\Kernel;
+
+class SVCTest extends \PHPUnit_Framework_TestCase
+{
+    public function testPredictSingleSampleWithLinearKernel()
+    {
+        $samples = [[1, 3], [1, 4], [2, 4], [3, 1], [4, 1], [4, 2]];
+        $labels = ['a', 'a', 'a', 'b', 'b', 'b'];
+
+        $classifier = new SVC(Kernel::LINEAR, $cost = 1000);
+        $classifier->train($samples, $labels);
+
+        $this->assertEquals('b', $classifier->predict([3, 2]));
+        $this->assertEquals('b', $classifier->predict([5, 1]));
+        $this->assertEquals('b', $classifier->predict([4, 3]));
+        $this->assertEquals('b', $classifier->predict([4, -5]));
+
+        $this->assertEquals('a', $classifier->predict([2, 3]));
+        $this->assertEquals('a', $classifier->predict([1, 2]));
+        $this->assertEquals('a', $classifier->predict([1, 5]));
+        $this->assertEquals('a', $classifier->predict([3, 10]));
+    }
+
+    public function testPredictArrayOfSamplesWithLinearKernel()
+    {
+        $trainSamples = [[1, 3], [1, 4], [2, 4], [3, 1], [4, 1], [4, 2]];
+        $trainLabels = ['a', 'a', 'a', 'b', 'b', 'b'];
+
+        $testSamples = [[3, 2], [5, 1], [4, 3], [4, -5], [2, 3], [1, 2], [1, 5], [3, 10]];
+        $testLabels = ['b', 'b', 'b', 'b', 'a', 'a', 'a', 'a'];
+
+        $classifier = new SVC(Kernel::LINEAR, $cost = 1000);
+        $classifier->train($trainSamples, $trainLabels);
+        $predictions = $classifier->predict($testSamples);
+
+        $this->assertEquals($testLabels, $predictions);
+    }
+}
--- a/tests/Phpml/FeatureExtraction/TokenCountVectorizerTest.php
+++ b/tests/Phpml/FeatureExtraction/TokenCountVectorizerTest.php
@ -0,0 +1,73 @@
+<?php
+
+declare (strict_types = 1);
+
+namespace tests\Phpml\FeatureExtraction;
+
+use Phpml\FeatureExtraction\TokenCountVectorizer;
+use Phpml\Tokenization\WhitespaceTokenizer;
+
+class TokenCountVectorizerTest extends \PHPUnit_Framework_TestCase
+{
+    /**
+     * Without a frequency threshold each sample is turned into a sparse map
+     * of vocabulary index => occurrence count, and the vocabulary lists the
+     * tokens in order of first appearance.
+     */
+    public function testTokenCountVectorizerWithWhitespaceTokenizer()
+    {
+        $samples = [
+            'Lorem ipsum dolor sit amet dolor',
+            'Mauris placerat ipsum dolor',
+            'Mauris diam eros fringilla diam',
+        ];
+
+        $vocabulary = ['Lorem', 'ipsum', 'dolor', 'sit', 'amet', 'Mauris', 'placerat', 'diam', 'eros', 'fringilla'];
+        $vector = [
+            [0 => 1, 1 => 1, 2 => 2, 3 => 1, 4 => 1],
+            [5 => 1, 6 => 1, 1 => 1, 2 => 1],
+            [5 => 1, 7 => 2, 8 => 1, 9 => 1],
+        ];
+
+        $vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer());
+
+        $this->assertEquals($vector, $vectorizer->transform($samples));
+        $this->assertEquals($vocabulary, $vectorizer->getVocabulary());
+    }
+
+    /**
+     * With a threshold of 0.5 a token must occur in at least half of the
+     * samples to be counted: 'dolor' (one sample out of four) is missing
+     * from the vectors although it still appears in the vocabulary.
+     */
+    public function testMinimumDocumentTokenCountFrequency()
+    {
+        $samples = [
+            'Lorem ipsum dolor sit amet',
+            'Lorem ipsum sit amet',
+            'ipsum sit amet',
+            'ipsum sit amet',
+        ];
+
+        $vocabulary = ['Lorem', 'ipsum', 'dolor', 'sit', 'amet'];
+        $vector = [
+            [0 => 1, 1 => 1, 3 => 1, 4 => 1],
+            [0 => 1, 1 => 1, 3 => 1, 4 => 1],
+            [1 => 1, 3 => 1, 4 => 1],
+            [1 => 1, 3 => 1, 4 => 1],
+        ];
+
+        $vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), 0.5);
+
+        $this->assertEquals($vector, $vectorizer->transform($samples));
+        $this->assertEquals($vocabulary, $vectorizer->getVocabulary());
+    }
+
+    /**
+     * With a threshold of 1 only tokens present in every sample are counted;
+     * here that is 'Lorem' alone (vocabulary index 0).
+     *
+     * Kept as a separate test so that a failure of the 0.5 scenario cannot
+     * mask a failure of this one.
+     */
+    public function testMinimumDocumentTokenCountFrequencyRequiringAllSamples()
+    {
+        $samples = [
+            'Lorem ipsum dolor sit amet',
+            'Morbi quis lacinia arcu. Sed eu sagittis Lorem',
+            'Suspendisse gravida consequat eros Lorem',
+        ];
+
+        $vector = [
+            [0 => 1],
+            [0 => 1],
+            [0 => 1],
+        ];
+
+        $vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), 1);
+
+        $this->assertEquals($vector, $vectorizer->transform($samples));
+    }
+}
--- a/tests/Phpml/Math/Statistic/MeanTest.php
+++ b/tests/Phpml/Math/Statistic/MeanTest.php
@ -8,6 +8,14 @@ use Phpml\Math\Statistic\Mean;

 class MeanTest extends \PHPUnit_Framework_TestCase
 {
+    /**
+     * Asking for the arithmetic mean of an empty array must raise an
+     * InvalidArgumentException.
+     *
+     * @expectedException \Phpml\Exception\InvalidArgumentException
+     */
+    public function testArithmeticThrowExceptionOnEmptyArray()
+    {
+        Mean::arithmetic([]);
+    }
+
    public function testArithmeticMean()
    {
        $delta = 0.01;
@ -15,4 +23,41 @@ class MeanTest extends \PHPUnit_Framework_TestCase
        $this->assertEquals(41.16, Mean::arithmetic([43, 21, 25, 42, 57, 59]), '', $delta);
        $this->assertEquals(1.7, Mean::arithmetic([0.5, 0.5, 1.5, 2.5, 3.5]), '', $delta);
    }
+
+    /**
+     * Asking for the median of an empty array must raise an
+     * InvalidArgumentException.
+     *
+     * @expectedException \Phpml\Exception\InvalidArgumentException
+     */
+    public function testMedianThrowExceptionOnEmptyArray()
+    {
+        Mean::median([]);
+    }
+
+    /**
+     * Five unsorted values: the expected median, 3, is the middle value once
+     * the inputs are put in order.
+     */
+    public function testMedianOnOddLengthArray()
+    {
+        $this->assertEquals(3, Mean::median([5, 2, 6, 1, 3]));
+    }
+
+    /**
+     * Six unsorted values: the expected median, 3.5, lies halfway between
+     * the two middle values (3 and 4) of the ordered inputs.
+     */
+    public function testMedianOnEvenLengthArray()
+    {
+        $this->assertEquals(3.5, Mean::median([5, 2, 6, 1, 3, 4]));
+    }
+
+    /**
+     * Asking for the mode of an empty array must raise an
+     * InvalidArgumentException.
+     *
+     * @expectedException \Phpml\Exception\InvalidArgumentException
+     */
+    public function testModeThrowExceptionOnEmptyArray()
+    {
+        Mean::mode([]);
+    }
+
+    /**
+     * The value 6 occurs three times, more often than any other input, and
+     * is therefore the expected mode.
+     */
+    public function testModeOnArray()
+    {
+        $this->assertEquals(6, Mean::mode([5, 2, 6, 1, 3, 4, 6, 6, 5]));
+    }
 }
--- a/tests/Phpml/Preprocessing/ImputerTest.php
+++ b/tests/Phpml/Preprocessing/ImputerTest.php
@ -0,0 +1,149 @@
+<?php
+
+declare (strict_types = 1);
+
+namespace tests\Preprocessing;
+
+use Phpml\Preprocessing\Imputer;
+use Phpml\Preprocessing\Imputer\Strategy\MeanStrategy;
+use Phpml\Preprocessing\Imputer\Strategy\MedianStrategy;
+use Phpml\Preprocessing\Imputer\Strategy\MostFrequentStrategy;
+
+/**
+ * Checks that Imputer::preprocess() replaces the null entries of a matrix in
+ * place, for each strategy and for both axes.
+ *
+ * NOTE(review): the first constructor argument (null) is presumably the value
+ * treated as "missing" - confirm against Imputer.
+ */
+class ImputerTest extends \PHPUnit_Framework_TestCase
+{
+    /**
+     * Tolerance used when comparing imputed float values.
+     */
+    const DELTA = 0.01;
+
+    public function testComplementsMissingValuesWithMeanStrategyOnColumnAxis()
+    {
+        $data = $this->incompleteMatrix();
+
+        $imputeData = [
+            [1, 5.33, 3, 4],
+            [4, 3, 2, 1],
+            [4.33, 6, 7, 8],
+            [8, 7, 4, 5],
+        ];
+
+        $imputer = new Imputer(null, new MeanStrategy(), Imputer::AXIS_COLUMN);
+        $imputer->preprocess($data);
+
+        $this->assertEquals($imputeData, $data, '', self::DELTA);
+    }
+
+    public function testComplementsMissingValuesWithMeanStrategyOnRowAxis()
+    {
+        $data = $this->incompleteMatrix();
+
+        $imputeData = [
+            [1, 2.66, 3, 4],
+            [4, 3, 2, 1],
+            [7, 6, 7, 8],
+            [8, 7, 6.66, 5],
+        ];
+
+        $imputer = new Imputer(null, new MeanStrategy(), Imputer::AXIS_ROW);
+        $imputer->preprocess($data);
+
+        $this->assertEquals($imputeData, $data, '', self::DELTA);
+    }
+
+    public function testComplementsMissingValuesWithMedianStrategyOnColumnAxis()
+    {
+        $data = $this->incompleteMatrix();
+
+        $imputeData = [
+            [1, 6, 3, 4],
+            [4, 3, 2, 1],
+            [4, 6, 7, 8],
+            [8, 7, 3, 5],
+        ];
+
+        $imputer = new Imputer(null, new MedianStrategy(), Imputer::AXIS_COLUMN);
+        $imputer->preprocess($data);
+
+        $this->assertEquals($imputeData, $data, '', self::DELTA);
+    }
+
+    public function testComplementsMissingValuesWithMedianStrategyOnRowAxis()
+    {
+        $data = $this->incompleteMatrix();
+
+        $imputeData = [
+            [1, 3, 3, 4],
+            [4, 3, 2, 1],
+            [7, 6, 7, 8],
+            [8, 7, 7, 5],
+        ];
+
+        $imputer = new Imputer(null, new MedianStrategy(), Imputer::AXIS_ROW);
+        $imputer->preprocess($data);
+
+        $this->assertEquals($imputeData, $data, '', self::DELTA);
+    }
+
+    public function testComplementsMissingValuesWithMostFrequentStrategyOnColumnAxis()
+    {
+        // Extra fifth row so that every column has an unambiguous most frequent value.
+        $data = [
+            [1, null, 3, 4],
+            [4, 3, 2, 1],
+            [null, 6, 7, 8],
+            [8, 7, null, 5],
+            [8, 3, 2, 5],
+        ];
+
+        $imputeData = [
+            [1, 3, 3, 4],
+            [4, 3, 2, 1],
+            [8, 6, 7, 8],
+            [8, 7, 2, 5],
+            [8, 3, 2, 5],
+        ];
+
+        $imputer = new Imputer(null, new MostFrequentStrategy(), Imputer::AXIS_COLUMN);
+        $imputer->preprocess($data);
+
+        $this->assertEquals($imputeData, $data);
+    }
+
+    public function testComplementsMissingValuesWithMostFrequentStrategyOnRowAxis()
+    {
+        // Extra fifth column so that each incomplete row has one repeated value.
+        $data = [
+            [1, null, 3, 4, 3],
+            [4, 3, 2, 1, 7],
+            [null, 6, 7, 8, 6],
+            [8, 7, null, 5, 5],
+            [8, 3, 2, 5, 4],
+        ];
+
+        $imputeData = [
+            [1, 3, 3, 4, 3],
+            [4, 3, 2, 1, 7],
+            [6, 6, 7, 8, 6],
+            [8, 7, 5, 5, 5],
+            [8, 3, 2, 5, 4],
+        ];
+
+        $imputer = new Imputer(null, new MostFrequentStrategy(), Imputer::AXIS_ROW);
+        $imputer->preprocess($data);
+
+        $this->assertEquals($imputeData, $data);
+    }
+
+    /**
+     * Returns a fresh copy of the 4x4 matrix with three null entries that
+     * the mean and median tests share. A new array is handed out per call
+     * because preprocess() rewrites its argument in place.
+     *
+     * @return array
+     */
+    private function incompleteMatrix(): array
+    {
+        return [
+            [1, null, 3, 4],
+            [4, 3, 2, 1],
+            [null, 6, 7, 8],
+            [8, 7, null, 5],
+        ];
+    }
+}
--- a/tests/Phpml/Preprocessing/NormalizerTest.php
+++ b/tests/Phpml/Preprocessing/NormalizerTest.php
@ -0,0 +1,58 @@
+<?php
+
+declare (strict_types = 1);
+
+namespace tests\Preprocessing;
+
+use Phpml\Preprocessing\Normalizer;
+
+class NormalizerTest extends \PHPUnit_Framework_TestCase
+{
+    /**
+     * A norm identifier the Normalizer does not accept must be refused at
+     * construction time.
+     *
+     * @expectedException \Phpml\Exception\NormalizerException
+     */
+    public function testThrowExceptionOnInvalidNorm()
+    {
+        new Normalizer(99);
+    }
+
+    /**
+     * A Normalizer built without arguments must rescale every row in place
+     * to the values listed below (compared with a 0.01 tolerance).
+     */
+    public function testNormalizeSamplesWithL2Norm()
+    {
+        $matrix = [
+            [1, -1, 2],
+            [2, 0, 0],
+            [0, 1, -1],
+        ];
+
+        $expected = [
+            [0.4, -0.4, 0.81],
+            [1.0, 0.0, 0.0],
+            [0.0, 0.7, -0.7],
+        ];
+
+        $l2Normalizer = new Normalizer();
+        $l2Normalizer->preprocess($matrix);
+
+        $tolerance = 0.01;
+        $this->assertEquals($expected, $matrix, '', $tolerance);
+    }
+
+    /**
+     * With Normalizer::NORM_L1 every row is rescaled in place so that the
+     * absolute values of its entries sum to one.
+     */
+    public function testNormalizeSamplesWithL1Norm()
+    {
+        $matrix = [
+            [1, -1, 2],
+            [2, 0, 0],
+            [0, 1, -1],
+        ];
+
+        $expected = [
+            [0.25, -0.25, 0.5],
+            [1.0, 0.0, 0.0],
+            [0.0, 0.5, -0.5],
+        ];
+
+        $l1Normalizer = new Normalizer(Normalizer::NORM_L1);
+        $l1Normalizer->preprocess($matrix);
+
+        $tolerance = 0.01;
+        $this->assertEquals($expected, $matrix, '', $tolerance);
+    }
+}
--- a/tests/Phpml/Regression/SVRTest.php
+++ b/tests/Phpml/Regression/SVRTest.php
@ -0,0 +1,37 @@
+<?php
+
+declare (strict_types = 1);
+
+namespace tests\Regression;
+
+use Phpml\Regression\SVR;
+use Phpml\SupportVectorMachine\Kernel;
+
+class SVRTest extends \PHPUnit_Framework_TestCase
+{
+    /**
+     * One feature per sample: a single sample passed to predict() must
+     * produce a single value within 0.01 of the expected one.
+     */
+    public function testPredictSingleFeatureSamples()
+    {
+        $observations = [[60], [61], [62], [63], [65]];
+        $outcomes = [3.1, 3.6, 3.8, 4, 4.1];
+
+        $model = new SVR(Kernel::LINEAR);
+        $model->train($observations, $outcomes);
+
+        $tolerance = 0.01;
+        $predicted = $model->predict([64]);
+
+        $this->assertEquals(4.03, $predicted, '', $tolerance);
+    }
+
+    /**
+     * Two features per sample: a list of samples passed to predict() must
+     * produce a list of values, each within 0.01 of the expected one.
+     */
+    public function testPredictMultiFeaturesSamples()
+    {
+        $observations = [[73676, 1996], [77006, 1998], [10565, 2000], [146088, 1995], [15000, 2001], [65940, 2000], [9300, 2000], [93739, 1996], [153260, 1994], [17764, 2002], [57000, 1998], [15000, 2000]];
+        $outcomes = [2000, 2750, 15500, 960, 4400, 8800, 7100, 2550, 1025, 5900, 4600, 4400];
+
+        $model = new SVR(Kernel::LINEAR);
+        $model->train($observations, $outcomes);
+
+        $tolerance = 0.01;
+        $predicted = $model->predict([[60000, 1996], [60000, 2000]]);
+
+        $this->assertEquals([4109.82, 4112.28], $predicted, '', $tolerance);
+    }
+}
--- a/tests/Phpml/SupportVectorMachine/DataTransformerTest.php
+++ b/tests/Phpml/SupportVectorMachine/DataTransformerTest.php
@ -0,0 +1,39 @@
+<?php
+
+declare (strict_types = 1);
+
+namespace tests\SupportVectorMachine;
+
+use Phpml\SupportVectorMachine\DataTransformer;
+
+class DataTransformerTest extends \PHPUnit_Framework_TestCase
+{
+    /**
+     * A labelled dataset becomes one text row per sample: a numeric label,
+     * then 1-based "index:value" pairs, each followed by a space, and a
+     * trailing PHP_EOL. The expected rows show 'a' written as 0 and 'b' as 1.
+     */
+    public function testTransformDatasetToTrainingSet()
+    {
+        $samples = [[1, 1], [2, 1], [3, 2], [4, 5]];
+        $labels = ['a', 'a', 'b', 'b'];
+
+        $expectedRows = [
+            '0 1:1 2:1 ',
+            '0 1:2 2:1 ',
+            '1 1:3 2:2 ',
+            '1 1:4 2:5 ',
+        ];
+        // Every row, the last one included, is terminated by PHP_EOL.
+        $expected = implode(PHP_EOL, $expectedRows).PHP_EOL;
+
+        $this->assertEquals($expected, DataTransformer::trainingSet($samples, $labels));
+    }
+
+    /**
+     * Unlabelled samples use the same row layout, with 0 written in the
+     * label position of every row.
+     */
+    public function testTransformSamplesToTestSet()
+    {
+        $samples = [[1, 1], [2, 1], [3, 2], [4, 5]];
+
+        $expectedRows = [
+            '0 1:1 2:1 ',
+            '0 1:2 2:1 ',
+            '0 1:3 2:2 ',
+            '0 1:4 2:5 ',
+        ];
+        // Every row, the last one included, is terminated by PHP_EOL.
+        $expected = implode(PHP_EOL, $expectedRows).PHP_EOL;
+
+        $this->assertEquals($expected, DataTransformer::testSet($samples));
+    }
+}
--- a/tests/Phpml/SupportVectorMachine/SupportVectorMachineTest.php
+++ b/tests/Phpml/SupportVectorMachine/SupportVectorMachineTest.php
@ -0,0 +1,82 @@
+<?php
+
+declare (strict_types = 1);
+
+namespace tests\SupportVectorMachine;
+
+use Phpml\SupportVectorMachine\Kernel;
+use Phpml\SupportVectorMachine\SupportVectorMachine;
+use Phpml\SupportVectorMachine\Type;
+
+class SupportVectorMachineTest extends \PHPUnit_Framework_TestCase
+{
+    /**
+     * Training a C-SVC with a linear kernel must make getModel() return
+     * exactly the model text below. The comparison is byte-exact, so the
+     * trailing space on each support-vector row is significant.
+     */
+    public function testTrainCSVCModelWithLinearKernel()
+    {
+        $trainingSamples = [[1, 3], [1, 4], [2, 4], [3, 1], [4, 1], [4, 2]];
+        $trainingLabels = ['a', 'a', 'a', 'b', 'b', 'b'];
+
+        $expectedModel = 'svm_type c_svc
+kernel_type linear
+nr_class 2
+total_sv 2
+rho 0
+label 0 1
+nr_sv 1 1
+SV
+0.25 1:2 2:4 
+-0.25 1:4 2:2 
+';
+
+        $machine = new SupportVectorMachine(Type::C_SVC, Kernel::LINEAR, 100.0);
+        $machine->train($trainingSamples, $trainingLabels);
+
+        $this->assertEquals($expectedModel, $machine->getModel());
+    }
+
+    /**
+     * predict() on a list of samples must return the original string labels,
+     * one per sample and in the same order.
+     */
+    public function testPredictSampleWithLinearKernel()
+    {
+        $trainingSamples = [[1, 3], [1, 4], [2, 4], [3, 1], [4, 1], [4, 2]];
+        $trainingLabels = ['a', 'a', 'a', 'b', 'b', 'b'];
+
+        $machine = new SupportVectorMachine(Type::C_SVC, Kernel::LINEAR, 100.0);
+        $machine->train($trainingSamples, $trainingLabels);
+
+        $predictions = $machine->predict([
+            [3, 2],
+            [2, 3],
+            [4, -5],
+        ]);
+
+        // Checked position by position, in order.
+        foreach (['b', 'a', 'b'] as $position => $expectedLabel) {
+            $this->assertEquals($expectedLabel, $predictions[$position]);
+        }
+    }
+
+    /**
+     * Three classes with the RBF kernel: each unseen sample must receive the
+     * label listed for its position.
+     */
+    public function testPredictSampleFromMultipleClassWithRbfKernel()
+    {
+        $trainingSamples = [
+            [1, 3], [1, 4], [1, 4],
+            [3, 1], [4, 1], [4, 2],
+            [-3, -1], [-4, -1], [-4, -2],
+        ];
+        $trainingLabels = [
+            'a', 'a', 'a',
+            'b', 'b', 'b',
+            'c', 'c', 'c',
+        ];
+
+        $machine = new SupportVectorMachine(Type::C_SVC, Kernel::RBF, 100.0);
+        $machine->train($trainingSamples, $trainingLabels);
+
+        $predictions = $machine->predict([
+            [1, 5],
+            [4, 3],
+            [-4, -3],
+        ]);
+
+        // Checked position by position, in order.
+        foreach (['a', 'b', 'c'] as $position => $expectedLabel) {
+            $this->assertEquals($expectedLabel, $predictions[$position]);
+        }
+    }
+}
--- a/tests/Phpml/Tokenization/WhitespaceTokenizerTest.php
+++ b/tests/Phpml/Tokenization/WhitespaceTokenizerTest.php
@ -0,0 +1,40 @@
+<?php
+
+declare (strict_types = 1);
+
+namespace tests\Tokenization;
+
+use Phpml\Tokenization\WhitespaceTokenizer;
+
+class WhitespaceTokenizerTest extends \PHPUnit_Framework_TestCase
+{
+    /**
+     * ASCII input: runs of spaces and line breaks separate tokens, while
+     * punctuation stays attached to the word it follows.
+     */
+    public function testTokenizationOnAscii()
+    {
+        $input = 'Lorem ipsum dolor sit amet, consectetur   adipiscing elit.
+                 Cras consectetur, dui et lobortis auctor. 
+                 Nulla vitae  congue lorem.';
+
+        $expected = [
+            'Lorem', 'ipsum', 'dolor', 'sit', 'amet,', 'consectetur', 'adipiscing', 'elit.',
+            'Cras', 'consectetur,', 'dui', 'et', 'lobortis', 'auctor.',
+            'Nulla', 'vitae', 'congue', 'lorem.',
+        ];
+
+        $actual = (new WhitespaceTokenizer())->tokenize($input);
+
+        $this->assertEquals($expected, $actual);
+    }
+
+    /**
+     * Multi-byte UTF-8 input: same rules as for ASCII, and single-character
+     * tokens are kept.
+     */
+    public function testTokenizationOnUtf8()
+    {
+        $input = '鋍鞎 鳼 鞮鞢騉 袟袘觕, 炟砏 蒮 謺貙蹖 偢偣唲 蒛 箷箯緷 鑴鱱爧 覮轀,
+                 剆坲 煘煓瑐 鬐鶤鶐 飹勫嫢 銪 餀 枲柊氠 鍎鞚韕 焲犈,
+                 殍涾烰 齞齝囃 蹅輶 鄜, 孻憵 擙樲橚 藒襓謥 岯岪弨 蒮 廞徲 孻憵懥 趡趛踠 槏';
+
+        $expected = [
+            '鋍鞎', '鳼', '鞮鞢騉', '袟袘觕,', '炟砏', '蒮', '謺貙蹖', '偢偣唲', '蒛', '箷箯緷', '鑴鱱爧', '覮轀,',
+            '剆坲', '煘煓瑐', '鬐鶤鶐', '飹勫嫢', '銪', '餀', '枲柊氠', '鍎鞚韕', '焲犈,',
+            '殍涾烰', '齞齝囃', '蹅輶', '鄜,', '孻憵', '擙樲橚', '藒襓謥', '岯岪弨', '蒮', '廞徲', '孻憵懥', '趡趛踠', '槏',
+        ];
+
+        $actual = (new WhitespaceTokenizer())->tokenize($input);
+
+        $this->assertEquals($expected, $actual);
+    }
+}
--- a/tests/Phpml/Tokenization/WordTokenizerTest.php
+++ b/tests/Phpml/Tokenization/WordTokenizerTest.php
@ -0,0 +1,40 @@
+<?php
+
+declare (strict_types = 1);
+
+namespace tests\Tokenization;
+
+use Phpml\Tokenization\WordTokenizer;
+
+class WordTokenizerTest extends \PHPUnit_Framework_TestCase
+{
+    /**
+     * ASCII input: punctuation, hyphens, slashes and semicolons all act as
+     * separators, so only the bare words are returned.
+     */
+    public function testTokenizationOnAscii()
+    {
+        $input = 'Lorem ipsum-dolor sit amet, consectetur/adipiscing elit.
+                 Cras consectetur, dui et lobortis;auctor. 
+                 Nulla vitae ,.,/ congue lorem.';
+
+        $expected = [
+            'Lorem', 'ipsum', 'dolor', 'sit', 'amet', 'consectetur', 'adipiscing', 'elit',
+            'Cras', 'consectetur', 'dui', 'et', 'lobortis', 'auctor',
+            'Nulla', 'vitae', 'congue', 'lorem',
+        ];
+
+        $actual = (new WordTokenizer())->tokenize($input);
+
+        $this->assertEquals($expected, $actual);
+    }
+
+    /**
+     * Multi-byte UTF-8 input: punctuation is stripped, and the
+     * single-character words present in the input are absent from the
+     * expected tokens.
+     */
+    public function testTokenizationOnUtf8()
+    {
+        $input = '鋍鞎 鳼 鞮鞢騉 袟袘觕, 炟砏 蒮 謺貙蹖 偢偣唲 蒛 箷箯緷 鑴鱱爧 覮轀,
+                 剆坲 煘煓瑐 鬐鶤鶐 飹勫嫢 銪 餀 枲柊氠 鍎鞚韕 焲犈,
+                 殍涾烰 齞齝囃 蹅輶 鄜, 孻憵 擙樲橚 藒襓謥 岯岪弨 蒮 廞徲 孻憵懥 趡趛踠 槏';
+
+        $expected = [
+            '鋍鞎', '鞮鞢騉', '袟袘觕', '炟砏', '謺貙蹖', '偢偣唲', '箷箯緷', '鑴鱱爧', '覮轀',
+            '剆坲', '煘煓瑐', '鬐鶤鶐', '飹勫嫢', '枲柊氠', '鍎鞚韕', '焲犈',
+            '殍涾烰', '齞齝囃', '蹅輶', '孻憵', '擙樲橚', '藒襓謥', '岯岪弨', '廞徲', '孻憵懥', '趡趛踠',
+        ];
+
+        $actual = (new WordTokenizer())->tokenize($input);
+
+        $this->assertEquals($expected, $actual);
+    }
+}
--- a/var/.gitkeep
+++ b/var/.gitkeep