Merge pull request #5 from php-ai/develop

SVM and Preprocessing tools
This commit is contained in:
Arkadiusz Kondas 2016-05-14 21:39:23 +02:00
commit 2393b9f137
68 changed files with 2184 additions and 143 deletions

2
.gitignore vendored
View File

@ -1 +1,3 @@
/vendor/
humbuglog.*
/bin/phpunit

View File

@ -1,4 +1,4 @@
# PHP Machine Learning library
# PHP-ML - Machine Learning library for PHP
[![Build Status](https://scrutinizer-ci.com/g/php-ai/php-ml/badges/build.png?b=develop)](https://scrutinizer-ci.com/g/php-ai/php-ml/build-status/develop)
[![Documentation Status](https://readthedocs.org/projects/php-ml/badge/?version=develop)](http://php-ml.readthedocs.org/en/develop/?badge=develop)
@ -37,22 +37,32 @@ composer require php-ai/php-ml
## Features
* Classification
* [SVC](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/svc/)
* [k-Nearest Neighbors](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/k-nearest-neighbors/)
* [Naive Bayes](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/naive-bayes/)
* Regression
* [Least Squares](http://php-ml.readthedocs.io/en/latest/machine-learning/regression/least-squares/)
* [SVR](http://php-ml.readthedocs.io/en/latest/machine-learning/regression/svr/)
* Clustering
* [k-Means](http://php-ml.readthedocs.io/en/latest/machine-learning/clustering/k-means)
* [DBSCAN](http://php-ml.readthedocs.io/en/latest/machine-learning/clustering/dbscan)
* Cross Validation
* [Random Split](http://php-ml.readthedocs.io/en/latest/machine-learning/cross-validation/random-split)
* Preprocessing
* [Normalization](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/normalization)
* [Imputation missing values](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/imputation-missing-values)
* Feature Extraction
* [Token Count Vectorizer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/token-count-vectorizer)
* Datasets
* [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset)
* Ready to use:
* [Iris](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/iris/)
* [Wine](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/wine/)
* [Glass](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/glass/)
* Math
* [Distance](http://php-ml.readthedocs.io/en/latest/math/distance/)
* [Matrix](http://php-ml.readthedocs.io/en/latest/math/matrix/)
* [Statistic](http://php-ml.readthedocs.io/en/latest/math/statistic/)
## Contribute

BIN
bin/libsvm/svm-predict Executable file

Binary file not shown.

BIN
bin/libsvm/svm-predict.exe Normal file

Binary file not shown.

BIN
bin/libsvm/svm-scale Executable file

Binary file not shown.

BIN
bin/libsvm/svm-scale.exe Normal file

Binary file not shown.

BIN
bin/libsvm/svm-train Executable file

Binary file not shown.

BIN
bin/libsvm/svm-train.exe Normal file

Binary file not shown.

View File

@ -1,7 +1,7 @@
{
"name": "php-ai/php-ml",
"type": "library",
"description": "PHP Machine Learning library",
"description": "PHP-ML - Machine Learning library for PHP",
"license": "MIT",
"keywords": ["machine learning","pattern recognition","computational learning theory","artificial intelligence"],
"homepage": "https://github.com/php-ai/php-ml",
@ -16,13 +16,13 @@
"Phpml": "src/"
}
},
"config": {
"bin-dir": "bin"
},
"require": {
"php": ">=7.0.0"
},
"require-dev": {
"phpunit/phpunit": "^5.2"
},
"config": {
"bin-dir": "bin"
}
}

56
composer.lock generated
View File

@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
"This file is @generated automatically"
],
"hash": "7c34eebd6b8749a1cd09df57e5d1f47a",
"hash": "f3e2d9975d300b3ea4c3568de44d8499",
"content-hash": "087091d0c339e9fa3a551a189ea658bf",
"packages": [],
"packages-dev": [
@ -64,16 +64,16 @@
},
{
"name": "myclabs/deep-copy",
"version": "1.5.0",
"version": "1.5.1",
"source": {
"type": "git",
"url": "https://github.com/myclabs/DeepCopy.git",
"reference": "e3abefcd7f106677fd352cd7c187d6c969aa9ddc"
"reference": "a8773992b362b58498eed24bf85005f363c34771"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/myclabs/DeepCopy/zipball/e3abefcd7f106677fd352cd7c187d6c969aa9ddc",
"reference": "e3abefcd7f106677fd352cd7c187d6c969aa9ddc",
"url": "https://api.github.com/repos/myclabs/DeepCopy/zipball/a8773992b362b58498eed24bf85005f363c34771",
"reference": "a8773992b362b58498eed24bf85005f363c34771",
"shasum": ""
},
"require": {
@ -102,7 +102,7 @@
"object",
"object graph"
],
"time": "2015-11-07 22:20:37"
"time": "2015-11-20 12:04:31"
},
{
"name": "phpdocumentor/reflection-docblock",
@ -217,16 +217,16 @@
},
{
"name": "phpunit/php-code-coverage",
"version": "3.3.0",
"version": "3.3.1",
"source": {
"type": "git",
"url": "https://github.com/sebastianbergmann/php-code-coverage.git",
"reference": "fe33716763b604ade4cb442c0794f5bd5ad73004"
"reference": "2431befdd451fac43fbcde94d1a92fb3b8b68f86"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/sebastianbergmann/php-code-coverage/zipball/fe33716763b604ade4cb442c0794f5bd5ad73004",
"reference": "fe33716763b604ade4cb442c0794f5bd5ad73004",
"url": "https://api.github.com/repos/sebastianbergmann/php-code-coverage/zipball/2431befdd451fac43fbcde94d1a92fb3b8b68f86",
"reference": "2431befdd451fac43fbcde94d1a92fb3b8b68f86",
"shasum": ""
},
"require": {
@ -244,7 +244,7 @@
},
"suggest": {
"ext-dom": "*",
"ext-xdebug": ">=2.2.1",
"ext-xdebug": ">=2.4.0",
"ext-xmlwriter": "*"
},
"type": "library",
@ -276,7 +276,7 @@
"testing",
"xunit"
],
"time": "2016-03-03 08:49:08"
"time": "2016-04-08 08:14:53"
},
{
"name": "phpunit/php-file-iterator",
@ -458,16 +458,16 @@
},
{
"name": "phpunit/phpunit",
"version": "5.3.1",
"version": "5.3.2",
"source": {
"type": "git",
"url": "https://github.com/sebastianbergmann/phpunit.git",
"reference": "34a3acb401ae79deb37bc6e5f5ec3d325b369b4c"
"reference": "2c6da3536035617bae3fe3db37283c9e0eb63ab3"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/sebastianbergmann/phpunit/zipball/34a3acb401ae79deb37bc6e5f5ec3d325b369b4c",
"reference": "34a3acb401ae79deb37bc6e5f5ec3d325b369b4c",
"url": "https://api.github.com/repos/sebastianbergmann/phpunit/zipball/2c6da3536035617bae3fe3db37283c9e0eb63ab3",
"reference": "2c6da3536035617bae3fe3db37283c9e0eb63ab3",
"shasum": ""
},
"require": {
@ -529,20 +529,20 @@
"testing",
"xunit"
],
"time": "2016-04-07 07:04:34"
"time": "2016-04-12 16:20:08"
},
{
"name": "phpunit/phpunit-mock-objects",
"version": "3.1.2",
"version": "3.1.3",
"source": {
"type": "git",
"url": "https://github.com/sebastianbergmann/phpunit-mock-objects.git",
"reference": "7c34c9bdde4131b824086457a3145e27dba10ca1"
"reference": "151c96874bff6fe61a25039df60e776613a61489"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/sebastianbergmann/phpunit-mock-objects/zipball/7c34c9bdde4131b824086457a3145e27dba10ca1",
"reference": "7c34c9bdde4131b824086457a3145e27dba10ca1",
"url": "https://api.github.com/repos/sebastianbergmann/phpunit-mock-objects/zipball/151c96874bff6fe61a25039df60e776613a61489",
"reference": "151c96874bff6fe61a25039df60e776613a61489",
"shasum": ""
},
"require": {
@ -585,7 +585,7 @@
"mock",
"xunit"
],
"time": "2016-03-24 05:58:25"
"time": "2016-04-20 14:39:26"
},
{
"name": "sebastian/code-unit-reverse-lookup",
@ -750,16 +750,16 @@
},
{
"name": "sebastian/environment",
"version": "1.3.5",
"version": "1.3.6",
"source": {
"type": "git",
"url": "https://github.com/sebastianbergmann/environment.git",
"reference": "dc7a29032cf72b54f36dac15a1ca5b3a1b6029bf"
"reference": "2292b116f43c272ff4328083096114f84ea46a56"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/sebastianbergmann/environment/zipball/dc7a29032cf72b54f36dac15a1ca5b3a1b6029bf",
"reference": "dc7a29032cf72b54f36dac15a1ca5b3a1b6029bf",
"url": "https://api.github.com/repos/sebastianbergmann/environment/zipball/2292b116f43c272ff4328083096114f84ea46a56",
"reference": "2292b116f43c272ff4328083096114f84ea46a56",
"shasum": ""
},
"require": {
@ -796,7 +796,7 @@
"environment",
"hhvm"
],
"time": "2016-02-26 18:40:46"
"time": "2016-05-04 07:59:13"
},
{
"name": "sebastian/exporter",
@ -1101,7 +1101,7 @@
},
{
"name": "symfony/yaml",
"version": "v3.0.4",
"version": "v3.0.5",
"source": {
"type": "git",
"url": "https://github.com/symfony/yaml.git",

View File

@ -1,4 +1,4 @@
# PHP Machine Learning library
# PHP-ML - Machine Learning library for PHP
[![Build Status](https://scrutinizer-ci.com/g/php-ai/php-ml/badges/build.png?b=develop)](https://scrutinizer-ci.com/g/php-ai/php-ml/build-status/develop)
[![Documentation Status](https://readthedocs.org/projects/php-ml/badge/?version=develop)](http://php-ml.readthedocs.org/en/develop/?badge=develop)
@ -37,22 +37,32 @@ composer require php-ai/php-ml
## Features
* Classification
* [SVC](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/svc/)
* [k-Nearest Neighbors](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/k-nearest-neighbors/)
* [Naive Bayes](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/naive-bayes/)
* Regression
* [Least Squares](http://php-ml.readthedocs.io/en/latest/machine-learning/regression/least-squares/)
* [SVR](http://php-ml.readthedocs.io/en/latest/machine-learning/regression/svr/)
* Clustering
* [k-Means](http://php-ml.readthedocs.io/en/latest/machine-learning/clustering/k-means)
* [DBSCAN](http://php-ml.readthedocs.io/en/latest/machine-learning/clustering/dbscan)
* Cross Validation
* [Random Split](http://php-ml.readthedocs.io/en/latest/machine-learning/cross-validation/random-split)
* Preprocessing
* [Normalization](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/normalization)
* [Imputation missing values](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/imputation-missing-values)
* Feature Extraction
* [Token Count Vectorizer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/token-count-vectorizer)
* Datasets
* [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset)
* Ready to use:
* [Iris](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/iris/)
* [Wine](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/wine/)
* [Glass](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/glass/)
* Math
* [Distance](http://php-ml.readthedocs.io/en/latest/math/distance/)
* [Matrix](http://php-ml.readthedocs.io/en/latest/math/matrix/)
* [Statistic](http://php-ml.readthedocs.io/en/latest/math/statistic/)
## Contribute

View File

@ -0,0 +1,47 @@
# Support Vector Classification
Classifier implementing Support Vector Machine based on libsvm.
### Constructor Parameters
* $kernel (int) - kernel type to be used in the algorithm (default Kernel::LINEAR)
* $cost (float) - parameter C of C-SVC (default 1.0)
* $degree (int) - degree of the Kernel::POLYNOMIAL function (default 3)
* $gamma (float) - kernel coefficient for Kernel::RBF, Kernel::POLYNOMIAL and Kernel::SIGMOID. If gamma is null then 1/features will be used instead.
* $coef0 (float) - independent term in kernel function. It is only significant in Kernel::POLYNOMIAL and Kernel::SIGMOID (default 0.0)
* $tolerance (float) - tolerance of termination criterion (default 0.001)
* $cacheSize (int) - cache memory size in MB (default 100)
* $shrinking (bool) - whether to use the shrinking heuristics (default true)
* $probabilityEstimates (bool) - whether to enable probability estimates (default false)
```
$classifier = new SVC(Kernel::LINEAR, $cost = 1000);
$classifier = new SVC(Kernel::RBF, $cost = 1000, $degree = 3, $gamma = 6);
```
### Train
To train a classifier simply provide train samples and labels (as `array`). Example:
```
use Phpml\Classification\SVC;
use Phpml\SupportVectorMachine\Kernel;
$samples = [[1, 3], [1, 4], [2, 4], [3, 1], [4, 1], [4, 2]];
$labels = ['a', 'a', 'a', 'b', 'b', 'b'];
$classifier = new SVC(Kernel::LINEAR, $cost = 1000);
$classifier->train($samples, $labels);
```
### Predict
To predict sample label use `predict` method. You can provide one sample or array of samples:
```
$classifier->predict([3, 2]);
// return 'b'
$classifier->predict([[3, 2], [1, 5]]);
// return ['b', 'a']
```

View File

@ -0,0 +1,42 @@
# Glass Dataset
From USA Forensic Science Service; 6 types of glass; defined in terms of their oxide content (i.e. Na, Fe, K, etc)
### Specification
| Classes | 6 |
| Samples total | 214 |
| Features per sample | 9 |
Samples per class:
* 70 float processed building windows
* 17 float processed vehicle windows
* 76 non-float processed building windows
* 13 containers
* 9 tableware
* 29 headlamps
### Load
To load the Glass dataset simply use:
```
use Phpml\Dataset\Demo\Glass;
$dataset = new Glass();
```
### Several samples example
```
RI: refractive index,Na: Sodium,Mg: Magnesium,Al: Aluminum,Si: Silicon,K: Potassium,Ca: Calcium,Ba: Barium,Fe: Iron,type of glass
1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.00,building_windows_float_processed
1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.00,building_windows_float_processed
1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.00,building_windows_float_processed
1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.00,building_windows_float_processed
1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.00,building_windows_float_processed
1.51596,12.79,3.61,1.62,72.97,0.64,8.07,0.00,0.26,building_windows_float_processed
1.51743,13.30,3.60,1.14,73.09,0.58,8.17,0.00,0.00,building_windows_float_processed
1.51756,13.15,3.61,1.05,73.24,0.57,8.24,0.00,0.00,building_windows_float_processed
1.51918,14.04,3.58,1.37,72.08,0.56,8.30,0.00,0.00,building_windows_float_processed
```

View File

@ -14,6 +14,8 @@ Most popular and widely available dataset of iris flower measurement and class n
To load the Iris dataset simply use:
```
use Phpml\Dataset\Demo\Iris;
$dataset = new Iris();
```

View File

@ -0,0 +1,35 @@
# Wine Dataset
These data are the results of a chemical analysis of wines grown in the same region in Italy but derived from three different cultivars. The analysis determined the quantities of 13 constituents found in each of the three types of wines.
### Specification
| Classes | 3 |
| Samples per class | class 1 59; class 2 71; class 3 48 |
| Samples total | 178 |
| Features per sample | 13 |
### Load
To load the Wine dataset simply use:
```
use Phpml\Dataset\Demo\Wine;
$dataset = new Wine();
```
### Several samples example
```
alcohol,malic acid,ash,alcalinity of ash,magnesium,total phenols,flavanoids,nonflavanoid phenols,proanthocyanins,color intensity,hue,OD280/OD315 of diluted wines,proline,class
14.23,1.71,2.43,15.6,127,2.8,3.06,.28,2.29,5.64,1.04,3.92,1065,1
13.2,1.78,2.14,11.2,100,2.65,2.76,.26,1.28,4.38,1.05,3.4,1050,1
13.16,2.36,2.67,18.6,101,2.8,3.24,.3,2.81,5.68,1.03,3.17,1185,1
14.37,1.95,2.5,16.8,113,3.85,3.49,.24,2.18,7.8,.86,3.45,1480,1
13.24,2.59,2.87,21,118,2.8,2.69,.39,1.82,4.32,1.04,2.93,735,1
14.2,1.76,2.45,15.2,112,3.27,3.39,.34,1.97,6.75,1.05,2.85,1450,1
14.39,1.87,2.45,14.6,96,2.5,2.52,.3,1.98,5.25,1.02,3.58,1290,1
14.06,2.15,2.61,17.6,121,2.6,2.51,.31,1.25,5.05,1.06,3.58,1295,1
14.83,1.64,2.17,14,97,2.8,2.98,.29,1.98,5.2,1.08,2.85,1045,1
```

View File

@ -0,0 +1,50 @@
# Token Count Vectorizer
Transform a collection of text samples to a vector of token counts.
### Constructor Parameters
* $tokenizer (Tokenizer) - tokenizer object (see below)
* $minDF (float) - ignore tokens whose sample (document) frequency is strictly lower than the given threshold. This value is also called the cut-off in the literature. (default 0)
```
use Phpml\FeatureExtraction\TokenCountVectorizer;
use Phpml\Tokenization\WhitespaceTokenizer;
$vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer());
```
### Transformation
To transform a collection of text samples use `transform` method. Example:
```
$samples = [
'Lorem ipsum dolor sit amet dolor',
'Mauris placerat ipsum dolor',
'Mauris diam eros fringilla diam',
];
$vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer());
$vectorizer->transform($samples);
// return $vector = [
// [0 => 1, 1 => 1, 2 => 2, 3 => 1, 4 => 1],
// [5 => 1, 6 => 1, 1 => 1, 2 => 1],
// [5 => 1, 7 => 2, 8 => 1, 9 => 1],
//];
```
### Vocabulary
You can extract vocabulary using `getVocabulary()` method. Example:
```
$vectorizer->getVocabulary();
// return $vocabulary = ['Lorem', 'ipsum', 'dolor', 'sit', 'amet', 'Mauris', 'placerat', 'diam', 'eros', 'fringilla'];
```
### Tokenizers
* WhitespaceTokenizer - select tokens by whitespace.
* WordTokenizer - select tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator).

View File

@ -0,0 +1,45 @@
# Imputation missing values
For various reasons, many real world datasets contain missing values, often encoded as blanks, NaNs or other placeholders.
To solve this problem you can use the `Imputer` class.
## Constructor Parameters
* $missingValue (mixed) - this value will be replaced (default null)
* $strategy (Strategy) - imputation strategy (ready to use: MeanStrategy, MedianStrategy, MostFrequentStrategy)
* $axis (int) - axis for strategy, Imputer::AXIS_COLUMN or Imputer::AXIS_ROW
```
$imputer = new Imputer(null, new MeanStrategy(), Imputer::AXIS_COLUMN);
$imputer = new Imputer(null, new MedianStrategy(), Imputer::AXIS_ROW);
```
## Strategy
* MeanStrategy - replace missing values using the mean along the axis
* MedianStrategy - replace missing values using the median along the axis
* MostFrequentStrategy - replace missing values using the most frequent value along the axis
## Example of use
```
$data = [
[1, null, 3, 4],
[4, 3, 2, 1],
[null, 6, 7, 8],
[8, 7, null, 5],
];
$imputer = new Imputer(null, new MeanStrategy(), Imputer::AXIS_COLUMN);
$imputer->preprocess($data);
/*
$data = [
[1, 5.33, 3, 4],
[4, 3, 2, 1],
[4.33, 6, 7, 8],
[8, 7, 4, 5],
];
*/
```

View File

@ -0,0 +1,59 @@
# Normalization
Normalization is the process of scaling individual samples to have unit norm.
## L2 norm
[http://mathworld.wolfram.com/L2-Norm.html](http://mathworld.wolfram.com/L2-Norm.html)
Example:
```
use Phpml\Preprocessing\Normalizer;
$samples = [
[1, -1, 2],
[2, 0, 0],
[0, 1, -1],
];
$normalizer = new Normalizer();
$normalizer->preprocess($samples);
/*
$samples = [
[0.4, -0.4, 0.81],
[1.0, 0.0, 0.0],
[0.0, 0.7, -0.7],
];
*/
```
## L1 norm
[http://mathworld.wolfram.com/L1-Norm.html](http://mathworld.wolfram.com/L1-Norm.html)
Example:
```
use Phpml\Preprocessing\Normalizer;
$samples = [
[1, -1, 2],
[2, 0, 0],
[0, 1, -1],
];
$normalizer = new Normalizer(Normalizer::NORM_L1);
$normalizer->preprocess($samples);
/*
$samples = [
[0.25, -0.25, 0.5],
[1.0, 0.0, 0.0],
[0.0, 0.5, -0.5],
];
*/
```

View File

@ -0,0 +1,44 @@
# Support Vector Regression
Class implementing Epsilon-Support Vector Regression based on libsvm.
### Constructor Parameters
* $kernel (int) - kernel type to be used in the algorithm (default Kernel::LINEAR)
* $degree (int) - degree of the Kernel::POLYNOMIAL function (default 3)
* $epsilon (float) - epsilon in loss function of epsilon-SVR (default 0.1)
* $cost (float) - parameter C of epsilon-SVR (default 1.0)
* $gamma (float) - kernel coefficient for Kernel::RBF, Kernel::POLYNOMIAL and Kernel::SIGMOID. If gamma is null then 1/features will be used instead.
* $coef0 (float) - independent term in kernel function. It is only significant in Kernel::POLYNOMIAL and Kernel::SIGMOID (default 0.0)
* $tolerance (float) - tolerance of termination criterion (default 0.001)
* $cacheSize (int) - cache memory size in MB (default 100)
* $shrinking (bool) - whether to use the shrinking heuristics (default true)
```
$regression = new SVR(Kernel::LINEAR);
$regression = new SVR(Kernel::LINEAR, $degree = 3, $epsilon=10.0);
```
### Train
To train a model simply provide training samples and target values (as `array`). Example:
```
use Phpml\Regression\SVR;
use Phpml\SupportVectorMachine\Kernel;
$samples = [[60], [61], [62], [63], [65]];
$targets = [3.1, 3.6, 3.8, 4, 4.1];
$regression = new SVR(Kernel::LINEAR);
$regression->train($samples, $targets);
```
### Predict
To predict sample target value use `predict` method. You can provide one sample or array of samples:
```
$regression->predict([64]);
// return 4.03
```

80
docs/math/statistic.md Normal file
View File

@ -0,0 +1,80 @@
# Statistic
Selected statistical methods.
## Correlation
Correlation coefficients are used in statistics to measure how strong a relationship is between two variables. There are several types of correlation coefficient.
### Pearson correlation
Pearson's correlation (also known as the Pearson correlation coefficient) is a correlation coefficient commonly used in linear regression.
Example:
```
use Phpml\Math\Statistic\Correlation;
$x = [43, 21, 25, 42, 57, 59];
$y = [99, 65, 79, 75, 87, 82];
Correlation::pearson($x, $y);
// return 0.549
```
## Mean
### Arithmetic
Example:
```
use Phpml\Math\Statistic\Mean;
Mean::arithmetic([2, 5]);
// return 3.5
Mean::arithmetic([0.5, 0.5, 1.5, 2.5, 3.5]);
// return 1.7
```
## Median
Example:
```
use Phpml\Math\Statistic\Mean;
Mean::median([5, 2, 6, 1, 3, 4]);
// return 3.5
Mean::median([5, 2, 6, 1, 3]);
// return 3
```
## Mode
Example:
```
use Phpml\Math\Statistic\Mean;
Mean::mode([5, 2, 6, 1, 3, 4, 6, 6, 5]);
// return 6
```
## Standard Deviation
Example:
```
use Phpml\Math\Statistic\StandardDeviation;
$population = [5, 6, 8, 9];
StandardDeviation::population($population);
// return 1.825
$population = [7100, 15500, 4400, 4400, 5900, 4600, 8800, 2000, 2750, 2550, 960, 1025];
StandardDeviation::population($population);
// return 4079
```

11
humbug.json.dist Normal file
View File

@ -0,0 +1,11 @@
{
"source": {
"directories": [
"src"
]
},
"timeout": 10,
"logs": {
"text": "humbuglog.txt"
}
}

View File

@ -1,17 +1,21 @@
site_name: PHP Machine Learning (PHP-ML)
site_name: PHP-ML - Machine Learning library for PHP
pages:
- Home: index.md
- Machine Learning:
- Classification:
- SVC: machine-learning/classification/svc.md
- KNearestNeighbors: machine-learning/classification/k-nearest-neighbors.md
- NaiveBayes: machine-learning/classification/naive-bayes.md
- Regression:
- LeastSquares: machine-learning/regression/least-squares.md
- SVR: machine-learning/regression/svr.md
- Clustering:
- KMeans: machine-learning/clustering/k-means.md
- DBSCAN: machine-learning/clustering/dbscan.md
- Cross Validation:
- RandomSplit: machine-learning/cross-validation/random-split.md
- Feature Extraction:
- Token Count Vectorizer: machine-learning/feature-extraction/token-count-vectorizer.md
- Datasets:
- Array Dataset: machine-learning/datasets/array-dataset.md
- CSV Dataset: machine-learning/datasets/csv-dataset.md

View File

@ -11,4 +11,11 @@
<directory>tests/*</directory>
</testsuite>
</testsuites>
<filter>
<whitelist processUncoveredFilesFromWhitelist="true">
<directory suffix=".php">src</directory>
</whitelist>
</filter>
</phpunit>

View File

@ -4,8 +4,8 @@ declare (strict_types = 1);
namespace Phpml\Classification;
use Phpml\Classification\Traits\Predictable;
use Phpml\Classification\Traits\Trainable;
use Phpml\Helper\Predictable;
use Phpml\Helper\Trainable;
use Phpml\Math\Distance;
use Phpml\Math\Distance\Euclidean;

View File

@ -4,8 +4,8 @@ declare (strict_types = 1);
namespace Phpml\Classification;
use Phpml\Classification\Traits\Predictable;
use Phpml\Classification\Traits\Trainable;
use Phpml\Helper\Predictable;
use Phpml\Helper\Trainable;
class NaiveBayes implements Classifier
{

View File

@ -0,0 +1,31 @@
<?php
declare (strict_types = 1);
namespace Phpml\Classification;
use Phpml\SupportVectorMachine\Kernel;
use Phpml\SupportVectorMachine\SupportVectorMachine;
use Phpml\SupportVectorMachine\Type;
/**
 * Support Vector Classification: a SupportVectorMachine configured with Type::C_SVC.
 */
class SVC extends SupportVectorMachine implements Classifier
{
    /**
     * Forwards the classifier settings to the SupportVectorMachine constructor.
     *
     * @param int        $kernel               kernel type to use (default Kernel::LINEAR)
     * @param float      $cost                 parameter C of C-SVC
     * @param int        $degree               degree of the polynomial kernel function
     * @param float|null $gamma                kernel coefficient; null is passed through unchanged
     * @param float      $coef0                independent term in the kernel function
     * @param float      $tolerance            tolerance of the termination criterion
     * @param int        $cacheSize            cache memory size
     * @param bool       $shrinking            whether to use the shrinking heuristics
     * @param bool       $probabilityEstimates whether to enable probability estimates
     */
    public function __construct(
        int $kernel = Kernel::LINEAR,
        float $cost = 1.0,
        int $degree = 3,
        float $gamma = null,
        float $coef0 = 0.0,
        float $tolerance = 0.001,
        int $cacheSize = 100,
        bool $shrinking = true,
        bool $probabilityEstimates = false
    ) {
        // The two literals below are fixed here and not exposed to callers.
        // NOTE(review): presumably nu (0.5) and epsilon (0.1) - confirm against
        // SupportVectorMachine::__construct, whose signature is not visible here.
        $fixedFourth = 0.5;
        $fixedEighth = 0.1;

        parent::__construct(
            Type::C_SVC,
            $kernel,
            $cost,
            $fixedFourth,
            $degree,
            $gamma,
            $coef0,
            $fixedEighth,
            $tolerance,
            $cacheSize,
            $shrinking,
            $probabilityEstimates
        );
    }
}

View File

@ -1,61 +0,0 @@
<?php
declare (strict_types = 1);
namespace Phpml\Classification;
use Phpml\Classification\Traits\Predictable;
use Phpml\Classification\Traits\Trainable;
use Phpml\Math\Kernel;
/**
* Classifier skeleton that only stores its configuration (kernel, C, tolerance, upper bound).
*
* NOTE(review): predictSample() has an empty body, so this class never produces
* a prediction of its own; training/prediction plumbing comes from the
* Trainable and Predictable traits, which are not visible here.
*/
class SupportVectorMachine implements Classifier
{
use Trainable, Predictable;
/**
* Kernel object; defaults to an RBF kernel when none is passed to the constructor.
*
* @var Kernel
*/
private $kernel;
/**
* @var float
*/
private $C;
/**
* @var float
*/
private $tolerance;
/**
* @var int
*/
private $upperBound;
/**
* Stores the given settings; a null kernel is replaced by Kernel\RBF(.001).
*
* @param Kernel $kernel kernel to use, or null for the RBF default
* @param float $C
* @param float $tolerance
* @param int $upperBound
*/
public function __construct(Kernel $kernel = null, float $C = 1.0, float $tolerance = .001, int $upperBound = 100)
{
if (null === $kernel) {
// "$gamma = .001" only labels the argument; the local $gamma is not used afterwards.
$kernel = new Kernel\RBF($gamma = .001);
}
$this->kernel = $kernel;
$this->C = $C;
$this->tolerance = $tolerance;
$this->upperBound = $upperBound;
}
/**
* Unimplemented stub: the body is empty, so the call evaluates to null.
*
* @param array $sample
*
* @return mixed
*/
protected function predictSample(array $sample)
{
}
}

View File

@ -49,7 +49,7 @@ class Point implements ArrayAccess
$distance += $difference * $difference;
}
return $precise ? sqrt($distance) : $distance;
return $precise ? sqrt((float) $distance) : $distance;
}
/**

View File

@ -150,37 +150,11 @@ class Space extends SplObjectStorage
{
switch ($initMethod) {
case KMeans::INIT_RANDOM:
list($min, $max) = $this->getBoundaries();
for ($n = 0; $n < $clustersNumber; ++$n) {
$clusters[] = new Cluster($this, $this->getRandomPoint($min, $max)->getCoordinates());
}
$clusters = $this->initializeRandomClusters($clustersNumber);
break;
case KMeans::INIT_KMEANS_PLUS_PLUS:
$position = rand(1, count($this));
for ($i = 1, $this->rewind(); $i < $position && $this->valid(); $i++, $this->next());
$clusters[] = new Cluster($this, $this->current()->getCoordinates());
$distances = new SplObjectStorage();
for ($i = 1; $i < $clustersNumber; ++$i) {
$sum = 0;
foreach ($this as $point) {
$distance = $point->getDistanceWith($point->getClosest($clusters));
$sum += $distances[$point] = $distance;
}
$sum = rand(0, (int) $sum);
foreach ($this as $point) {
if (($sum -= $distances[$point]) > 0) {
continue;
}
$clusters[] = new Cluster($this, $point->getCoordinates());
break;
}
}
$clusters = $this->initializeKMPPClusters($clustersNumber);
break;
}
$clusters[0]->attachAll($this);
@ -230,4 +204,56 @@ class Space extends SplObjectStorage
return $convergence;
}
/**
 * Creates the requested number of clusters, each seeded at a random point
 * drawn via getRandomPoint() from the values returned by getBoundaries().
 *
 * @param int $clustersNumber number of clusters to create
 *
 * @return array list of Cluster objects (empty when $clustersNumber is not positive)
 */
private function initializeRandomClusters(int $clustersNumber)
{
    list($lower, $upper) = $this->getBoundaries();

    $seeded = [];
    while (count($seeded) < $clustersNumber) {
        $origin = $this->getRandomPoint($lower, $upper);
        $seeded[] = new Cluster($this, $origin->getCoordinates());
    }

    return $seeded;
}
/**
* Seeds clusters for the KMeans::INIT_KMEANS_PLUS_PLUS initialisation method.
*
* The first cluster is placed on a randomly chosen point of this space. Every
* further cluster is placed on a point picked by a distance-weighted draw:
* points with a larger distance to their closest existing cluster cover a
* larger share of the drawn range.
*
* The first cluster is created unconditionally, so the result contains at
* least one cluster even when $clustersNumber is lower than 1.
*
* @param int $clustersNumber number of clusters to create
*
* @return array list of Cluster objects
*/
protected function initializeKMPPClusters(int $clustersNumber)
{
$clusters = [];
// Pick a 1-based position and advance the internal iterator to it (empty loop body).
$position = rand(1, count($this));
for ($i = 1, $this->rewind(); $i < $position && $this->valid(); $i++, $this->next());
$clusters[] = new Cluster($this, $this->current()->getCoordinates());
// Per-point distance cache, rewritten on every outer iteration.
$distances = new SplObjectStorage();
for ($i = 1; $i < $clustersNumber; ++$i) {
$sum = 0;
foreach ($this as $point) {
// NOTE(review): whether getDistanceWith() returns a squared or a rooted distance
// depends on its default arguments, which are not visible here - confirm.
$distance = $point->getDistanceWith($point->getClosest($clusters));
$sum += $distances[$point] = $distance;
}
// Draw an integer threshold in [0, (int) $sum]; the fractional part of the sum is dropped.
$sum = rand(0, (int) $sum);
foreach ($this as $point) {
// Walk the points, consuming the threshold; the point that exhausts it becomes the new seed.
if (($sum -= $distances[$point]) > 0) {
continue;
}
$clusters[] = new Cluster($this, $point->getCoordinates());
break;
}
}
return $clusters;
}
}

View File

@ -6,7 +6,6 @@ namespace Phpml\Dataset;
interface Dataset
{
const SOME = 'z';
/**
* @return array
*/

View File

@ -0,0 +1,16 @@
<?php
declare (strict_types = 1);
namespace Phpml\Exception;
class NormalizerException extends \Exception
{
/**
* @return NormalizerException
*/
public static function unknownNorm()
{
return new self('Unknown norm supplied.');
}
}

View File

@ -0,0 +1,163 @@
<?php
declare (strict_types = 1);
namespace Phpml\FeatureExtraction;
use Phpml\Tokenization\Tokenizer;
/**
 * Transforms a collection of text samples into sparse token-count vectors.
 *
 * Each sample becomes an array mapping a token's vocabulary index to the
 * number of times the token occurs in that sample. The vocabulary and the
 * per-token sample counters live on the instance and are only initialised in
 * the constructor, so they keep growing across successive transform() calls.
 */
class TokenCountVectorizer implements Vectorizer
{
    /**
     * @var Tokenizer
     */
    private $tokenizer;

    /**
     * Minimum share of samples a token must occur in to be kept; 0 disables the filter.
     *
     * @var float
     */
    private $minDF;

    /**
     * Maps every known token to its vector index, in order of first appearance.
     *
     * @var array
     */
    private $vocabulary;

    /**
     * Maps every known token to the number of samples it occurred in.
     *
     * @var array
     */
    private $frequencies;

    /**
     * @param Tokenizer $tokenizer splits a sample into tokens
     * @param float     $minDF     tokens whose sample frequency is strictly lower than this are dropped
     */
    public function __construct(Tokenizer $tokenizer, float $minDF = 0)
    {
        $this->tokenizer = $tokenizer;
        $this->minDF = $minDF;
        $this->vocabulary = [];
        $this->frequencies = [];
    }

    /**
     * Converts every sample into its token-count vector and applies the minDF filter.
     *
     * @param array $samples text samples; the keys are preserved in the result
     *
     * @return array one [vocabulary index => count] array per sample
     */
    public function transform(array $samples): array
    {
        foreach ($samples as $index => $sample) {
            $samples[$index] = $this->transformSample($sample);
        }

        return $this->checkDocumentFrequency($samples);
    }

    /**
     * Returns the vocabulary as [index => token].
     *
     * @return array
     */
    public function getVocabulary()
    {
        return array_flip($this->vocabulary);
    }

    /**
     * Counts the tokens of a single sample.
     *
     * @param string $sample
     *
     * @return array [vocabulary index => occurrences in this sample]
     */
    private function transformSample(string $sample): array
    {
        $counts = [];

        foreach ($this->tokenizer->tokenize($sample) as $token) {
            $index = $this->getTokenIndex($token);

            if (!isset($counts[$index])) {
                $counts[$index] = 0;
                // minDF is a sample (document) frequency, so a token is
                // counted once per sample no matter how often it repeats.
                $this->updateFrequency($token);
            }

            ++$counts[$index];
        }

        return $counts;
    }

    /**
     * Returns the vocabulary index of a token, registering the token on first sight.
     *
     * @param string $token
     *
     * @return int
     */
    private function getTokenIndex(string $token): int
    {
        if (!isset($this->vocabulary[$token])) {
            $this->vocabulary[$token] = count($this->vocabulary);
        }

        return $this->vocabulary[$token];
    }

    /**
     * Increments the number of samples the token has been seen in.
     *
     * @param string $token
     */
    private function updateFrequency(string $token)
    {
        if (!isset($this->frequencies[$token])) {
            $this->frequencies[$token] = 0;
        }

        ++$this->frequencies[$token];
    }

    /**
     * Drops, from every vector, the tokens that fall below the minDF threshold.
     *
     * @param array $samples token-count vectors
     *
     * @return array
     */
    private function checkDocumentFrequency(array $samples): array
    {
        $samplesCount = count($samples);

        // Nothing to filter without a threshold; the count guard also keeps
        // the frequency ratio from dividing by zero on an empty batch.
        if ($this->minDF <= 0 || 0 === $samplesCount) {
            return $samples;
        }

        $beyondMinimum = $this->getBeyondMinimumIndexes($samplesCount);

        foreach ($samples as $index => $sample) {
            $samples[$index] = $this->unsetBeyondMinimum($sample, $beyondMinimum);
        }

        return $samples;
    }

    /**
     * Removes the given vocabulary indexes from a single vector.
     *
     * @param array $sample
     * @param array $beyondMinimum vocabulary indexes to remove
     *
     * @return array
     */
    private function unsetBeyondMinimum(array $sample, array $beyondMinimum): array
    {
        foreach ($beyondMinimum as $index) {
            unset($sample[$index]);
        }

        return $sample;
    }

    /**
     * Collects the vocabulary indexes of tokens whose sample frequency is below minDF.
     *
     * @param int $samplesCount number of samples in the current batch (must be > 0)
     *
     * @return array
     */
    private function getBeyondMinimumIndexes(int $samplesCount): array
    {
        $indexes = [];

        foreach ($this->frequencies as $token => $frequency) {
            if (($frequency / $samplesCount) < $this->minDF) {
                // Read the index straight from the vocabulary: PHP stores
                // numeric-string keys such as "2016" as integers, and passing
                // such a key to the string-typed getTokenIndex() would raise
                // a TypeError under strict_types.
                $indexes[] = $this->vocabulary[$token];
            }
        }

        return $indexes;
    }
}

View File

@ -0,0 +1,15 @@
<?php
declare (strict_types = 1);
namespace Phpml\FeatureExtraction;
/**
* Contract for feature extractors that convert a collection of samples into arrays.
*/
interface Vectorizer
{
/**
* Transforms the given samples and returns the transformed collection.
*
* @param array $samples samples to transform
*
* @return array transformed samples
*/
public function transform(array $samples): array;
}

View File

@ -2,7 +2,7 @@
declare (strict_types = 1);
namespace Phpml\Classification\Traits;
namespace Phpml\Helper;
trait Predictable
{

View File

@ -2,7 +2,7 @@
declare (strict_types = 1);
namespace Phpml\Classification\Traits;
namespace Phpml\Helper;
trait Trainable
{

View File

@ -30,6 +30,6 @@ class Euclidean implements Distance
$distance += pow($a[$i] - $b[$i], 2);
}
return sqrt($distance);
return sqrt((float) $distance);
}
}

View File

@ -147,7 +147,7 @@ class Matrix
for ($j = 0; $j < $this->columns; ++$j) {
$subMatrix = $this->crossOut(0, $j);
$minor = $this->matrix[0][$j] * $subMatrix->getDeterminant();
$determinant += fmod($j, 2) == 0 ? $minor : -$minor;
$determinant += fmod((float) $j, 2.0) == 0 ? $minor : -$minor;
}
}
@ -236,7 +236,7 @@ class Matrix
for ($i = 0; $i < $this->rows; ++$i) {
for ($j = 0; $j < $this->columns; ++$j) {
$minor = $this->crossOut($i, $j)->getDeterminant();
$newMatrix[$i][$j] = fmod($i + $j, 2) == 0 ? $minor : -$minor;
$newMatrix[$i][$j] = fmod((float) ($i + $j), 2.0) == 0 ? $minor : -$minor;
}
}

View File

@ -38,7 +38,7 @@ class Correlation
$b2 = $b2 + pow($b, 2);
}
$corr = $axb / sqrt($a2 * $b2);
$corr = $axb / sqrt((float) ($a2 * $b2));
return $corr;
}

View File

@ -4,15 +4,72 @@ declare (strict_types = 1);
namespace Phpml\Math\Statistic;
use Phpml\Exception\InvalidArgumentException;
class Mean
{
/**
* @param array $a
* @param array $numbers
*
* @return float
*
* @throws InvalidArgumentException
*/
public static function arithmetic(array $a)
public static function arithmetic(array $numbers)
{
return array_sum($a) / count($a);
self::checkArrayLength($numbers);
return array_sum($numbers) / count($numbers);
}
/**
 * Median of the given numbers: the middle element of the numerically
 * sorted list, or the average of the two middle elements when the list
 * has an even length.
 *
 * @param array $numbers
 *
 * @return float|mixed
 *
 * @throws InvalidArgumentException when $numbers is empty
 */
public static function median(array $numbers)
{
    self::checkArrayLength($numbers);

    sort($numbers, SORT_NUMERIC);

    $total = count($numbers);
    $upperMiddle = intdiv($total, 2);

    // Odd length: there is exactly one middle element.
    if (1 === $total % 2) {
        return $numbers[$upperMiddle];
    }

    return ($numbers[$upperMiddle] + $numbers[$upperMiddle - 1]) / 2;
}
/**
 * Most frequent value of the given numbers.
 *
 * When several values share the highest count, the one that occurs first
 * in $numbers wins. Integers and strings are counted exactly as
 * array_count_values() would count them (numeric strings are returned as
 * integers, as before). Floats, which array_count_values() rejects with a
 * warning, are counted by their string representation and returned as the
 * first float that was seen for that representation.
 *
 * @param array $numbers ints, floats or strings
 *
 * @return mixed
 *
 * @throws InvalidArgumentException when $numbers is empty
 */
public static function mode(array $numbers)
{
    self::checkArrayLength($numbers);

    $counts = [];
    $floats = [];

    foreach ($numbers as $number) {
        $key = $number;

        if (is_float($number)) {
            // Floats are not valid array keys; count them by their string form.
            $key = (string) $number;
            if (!isset($counts[$key])) {
                $floats[$key] = $number;
            }
        }

        $counts[$key] = ($counts[$key] ?? 0) + 1;
    }

    // Strict search: the first key holding the maximum count.
    $mode = array_search(max($counts), $counts, true);

    return $floats[$mode] ?? $mode;
}
/**
 * Guard used by the statistics above: reject empty input.
 *
 * @param array $array
 *
 * @throws InvalidArgumentException when $array has no elements
 */
private static function checkArrayLength(array $array)
{
    if (empty($array)) {
        throw InvalidArgumentException::arrayCantBeEmpty();
    }
}
}

View File

@ -39,6 +39,6 @@ class StandardDeviation
--$n;
}
return sqrt($carry / $n);
return sqrt((float) ($carry / $n));
}
}

View File

@ -0,0 +1,86 @@
<?php
declare (strict_types = 1);
namespace Phpml\Preprocessing;
use Phpml\Preprocessing\Imputer\Strategy;
/**
 * Replaces every occurrence of a "missing value" marker in a data set with
 * a value computed by a Strategy from the other values on the same axis
 * (the same column by default, or the same row).
 */
class Imputer implements Preprocessor
{
    const AXIS_COLUMN = 0;
    const AXIS_ROW = 1;

    /**
     * @var mixed marker that identifies a missing value (compared with ===)
     */
    private $missingValue;

    /**
     * @var Strategy
     */
    private $strategy;

    /**
     * @var int one of the AXIS_* constants
     */
    private $axis;

    /**
     * The first parameter no longer declares a default: it precedes a
     * required parameter, so the default could never be used, and PHP 8
     * deprecates that declaration form. Call sites are unaffected.
     *
     * @param mixed    $missingValue
     * @param Strategy $strategy
     * @param int      $axis
     */
    public function __construct($missingValue, Strategy $strategy, int $axis = self::AXIS_COLUMN)
    {
        $this->missingValue = $missingValue;
        $this->strategy = $strategy;
        $this->axis = $axis;
    }

    /**
     * Impute the missing values of $samples in place.
     *
     * @param array $samples
     */
    public function preprocess(array &$samples)
    {
        // Statistics are always computed from the data as it was passed in,
        // so a value imputed earlier never influences a later replacement.
        $original = $samples;

        foreach ($samples as &$sample) {
            $this->preprocessSample($sample, $original);
        }
        // Break the reference so later writes to $sample cannot alter the last row.
        unset($sample);
    }

    /**
     * @param array $sample  row to fix, modified in place
     * @param array $samples complete, unmodified data set
     */
    private function preprocessSample(array &$sample, array $samples)
    {
        $originalRow = $sample;

        foreach ($sample as $column => &$value) {
            if ($value === $this->missingValue) {
                $value = $this->strategy->replaceValue($this->getAxis($column, $originalRow, $samples));
            }
        }
        unset($value);
    }

    /**
     * Collect the known (non-missing) values of the configured axis.
     *
     * @param int|string $column
     * @param array      $currentSample
     * @param array      $samples
     *
     * @return array list of known values
     */
    private function getAxis($column, array $currentSample, array $samples): array
    {
        if (self::AXIS_ROW === $this->axis) {
            // Strict comparison, consistent with the column axis below and
            // with the detection of missing values in preprocessSample().
            return array_values(array_filter($currentSample, function ($value) {
                return $value !== $this->missingValue;
            }));
        }

        $axis = [];
        foreach ($samples as $sample) {
            // Rows that do not have this column contribute nothing.
            if (array_key_exists($column, $sample) && $sample[$column] !== $this->missingValue) {
                $axis[] = $sample[$column];
            }
        }

        return $axis;
    }
}

View File

@ -0,0 +1,15 @@
<?php
declare (strict_types = 1);
namespace Phpml\Preprocessing\Imputer;
/**
 * Contract for the replacement strategies used by the Imputer: given the
 * values of the current axis, produce the value that replaces a missing one.
 */
interface Strategy
{
/**
 * @param array $currentAxis values of the axis the missing value belongs to
 *
 * @return mixed the replacement value
 */
public function replaceValue(array $currentAxis);
}

View File

@ -0,0 +1,21 @@
<?php
declare (strict_types = 1);
namespace Phpml\Preprocessing\Imputer\Strategy;
use Phpml\Preprocessing\Imputer\Strategy;
use Phpml\Math\Statistic\Mean;
/**
 * Imputation strategy that replaces a missing value with the arithmetic
 * mean of the axis, delegating to Mean::arithmetic().
 */
class MeanStrategy implements Strategy
{
/**
 * @param array $currentAxis values of the axis the missing value belongs to
 *
 * @return float
 */
public function replaceValue(array $currentAxis)
{
return Mean::arithmetic($currentAxis);
}
}

View File

@ -0,0 +1,21 @@
<?php
declare (strict_types = 1);
namespace Phpml\Preprocessing\Imputer\Strategy;
use Phpml\Preprocessing\Imputer\Strategy;
use Phpml\Math\Statistic\Mean;
/**
 * Imputation strategy that replaces a missing value with the median of
 * the axis, delegating to Mean::median().
 */
class MedianStrategy implements Strategy
{
/**
 * @param array $currentAxis values of the axis the missing value belongs to
 *
 * @return float|mixed whatever Mean::median() returns for the axis
 */
public function replaceValue(array $currentAxis)
{
return Mean::median($currentAxis);
}
}

View File

@ -0,0 +1,21 @@
<?php
declare (strict_types = 1);
namespace Phpml\Preprocessing\Imputer\Strategy;
use Phpml\Preprocessing\Imputer\Strategy;
use Phpml\Math\Statistic\Mean;
/**
 * Imputation strategy that replaces a missing value with the most
 * frequent value of the axis, delegating to Mean::mode().
 */
class MostFrequentStrategy implements Strategy
{
/**
 * @param array $currentAxis values of the axis the missing value belongs to
 *
 * @return float|mixed whatever Mean::mode() returns for the axis
 */
public function replaceValue(array $currentAxis)
{
return Mean::mode($currentAxis);
}
}

View File

@ -0,0 +1,83 @@
<?php
declare (strict_types = 1);
namespace Phpml\Preprocessing;
use Phpml\Exception\NormalizerException;
/**
 * Scales every sample (row) individually so that its L1 or L2 norm
 * becomes 1. Samples are modified in place.
 */
class Normalizer implements Preprocessor
{
    const NORM_L1 = 1;
    const NORM_L2 = 2;

    /**
     * @var int one of the NORM_* constants
     */
    private $norm;

    /**
     * @param int $norm
     *
     * @throws NormalizerException when $norm is not a NORM_* constant
     */
    public function __construct(int $norm = self::NORM_L2)
    {
        if (!in_array($norm, [self::NORM_L1, self::NORM_L2], true)) {
            throw NormalizerException::unknownNorm();
        }

        $this->norm = $norm;
    }

    /**
     * Normalize each sample in place.
     *
     * @param array $samples
     */
    public function preprocess(array &$samples)
    {
        // Resolves to normalizeL1() or normalizeL2(); the constructor
        // guarantees that $this->norm is 1 or 2.
        $method = sprintf('normalizeL%s', $this->norm);

        foreach ($samples as &$sample) {
            $this->$method($sample);
        }
        // Break the reference so later writes to $sample cannot alter the last row.
        unset($sample);
    }

    /**
     * Divide every feature by the sum of absolute values of the sample.
     * A sample whose norm is zero becomes a uniform vector of 1/n.
     *
     * @param array $sample
     */
    private function normalizeL1(array &$sample)
    {
        $count = count($sample);
        if (0 === $count) {
            // Nothing to scale, and 1/n below would divide by zero.
            return;
        }

        $norm1 = 0;
        foreach ($sample as $feature) {
            $norm1 += abs($feature);
        }

        if (0 == $norm1) {
            $sample = array_fill(0, $count, 1.0 / $count);

            return;
        }

        foreach ($sample as &$feature) {
            $feature = $feature / $norm1;
        }
        unset($feature);
    }

    /**
     * Divide every feature by the Euclidean length of the sample.
     * A sample whose norm is zero becomes a vector of ones.
     *
     * @param array $sample
     */
    private function normalizeL2(array &$sample)
    {
        $norm2 = 0;
        foreach ($sample as $feature) {
            $norm2 += $feature * $feature;
        }
        $norm2 = sqrt((float) $norm2);

        if (0 == $norm2) {
            $sample = array_fill(0, count($sample), 1);

            return;
        }

        foreach ($sample as &$feature) {
            $feature = $feature / $norm2;
        }
        unset($feature);
    }
}

View File

@ -0,0 +1,13 @@
<?php
declare (strict_types = 1);
namespace Phpml\Preprocessing;
/**
 * Contract for preprocessing steps. The samples are received by reference
 * and no return value is declared, so implementations change them in place.
 */
interface Preprocessor
{
/**
 * @param array $samples samples to preprocess, passed by reference
 */
public function preprocess(array &$samples);
}

View File

@ -4,10 +4,12 @@ declare (strict_types = 1);
namespace Phpml\Regression;
use Phpml\Helper\Predictable;
use Phpml\Math\Matrix;
class LeastSquares implements Regression
{
use Predictable;
/**
* @var array
*/
@ -45,7 +47,7 @@ class LeastSquares implements Regression
*
* @return mixed
*/
public function predict($sample)
public function predictSample(array $sample)
{
$result = $this->intercept;
foreach ($this->coefficients as $index => $coefficient) {

View File

@ -13,9 +13,9 @@ interface Regression
public function train(array $samples, array $targets);
/**
* @param float $sample
* @param array $samples
*
* @return mixed
*/
public function predict($sample);
public function predict(array $samples);
}

View File

@ -0,0 +1,31 @@
<?php
declare (strict_types = 1);
namespace Phpml\Regression;
use Phpml\SupportVectorMachine\Kernel;
use Phpml\SupportVectorMachine\SupportVectorMachine;
use Phpml\SupportVectorMachine\Type;
/**
 * Support vector regression: a SupportVectorMachine preconfigured with the
 * Type::EPSILON_SVR type.
 */
class SVR extends SupportVectorMachine implements Regression
{
/**
 * Forwards the arguments to the parent constructor in the order it expects;
 * nu is fixed to 0.5 and probability estimates are disabled.
 *
 * @param int $kernel one of the Kernel constants
 * @param int $degree
 * @param float $epsilon
 * @param float $cost
 * @param float|null $gamma
 * @param float $coef0
 * @param float $tolerance
 * @param int $cacheSize
 * @param bool $shrinking
 */
public function __construct(
int $kernel = Kernel::RBF, int $degree = 3, float $epsilon = 0.1, float $cost = 1.0,
float $gamma = null, float $coef0 = 0.0, float $tolerance = 0.001,
int $cacheSize = 100, bool $shrinking = true
) {
parent::__construct(Type::EPSILON_SVR, $kernel, $cost, 0.5, $degree, $gamma, $coef0, $epsilon, $tolerance, $cacheSize, $shrinking, false);
}
}

View File

@ -0,0 +1,101 @@
<?php
declare (strict_types = 1);
namespace Phpml\SupportVectorMachine;
/**
 * Converts samples and labels to and from the plain-text format exchanged
 * with the libsvm command line tools: one line per sample, made of a label
 * (or target) followed by space separated "index:value" pairs whose
 * indexes start at 1.
 */
class DataTransformer
{
    /**
     * Build the content of a training file.
     *
     * @param array $samples samples, keyed like $labels
     * @param array $labels  class labels, or numeric targets when $targets is true
     * @param bool  $targets true to write $labels verbatim (regression),
     *                       false to replace them by numeric class ids
     *
     * @return string
     */
    public static function trainingSet(array $samples, array $labels, bool $targets = false): string
    {
        $numericLabels = $targets ? [] : self::numericLabels($labels);

        $set = '';
        foreach ($labels as $index => $label) {
            $set .= sprintf('%s %s %s', ($targets ? $label : $numericLabels[$label]), self::sampleRow($samples[$index]), PHP_EOL);
        }

        return $set;
    }

    /**
     * Build the content of a prediction input file. Accepts either a list
     * of samples or a single sample; every row gets the dummy label 0.
     *
     * @param array $samples
     *
     * @return string empty string when $samples is empty
     */
    public static function testSet(array $samples): string
    {
        if ([] === $samples) {
            return '';
        }

        // Inspect the first element whatever its key is, so that sample
        // lists that are not indexed from 0 are not mistaken for one sample.
        if (!is_array(reset($samples))) {
            $samples = [$samples];
        }

        $set = '';
        foreach ($samples as $sample) {
            $set .= sprintf('0 %s %s', self::sampleRow($sample), PHP_EOL);
        }

        return $set;
    }

    /**
     * Map the numeric class ids found in the raw prediction output back to
     * the original labels.
     *
     * @param string $rawPredictions one numeric class id per line
     * @param array  $labels         labels the model was trained with
     *
     * @return array one entry per non-empty line; false for a line that
     *               does not hold a known class id
     */
    public static function predictions(string $rawPredictions, array $labels): array
    {
        $labelsByNumber = array_flip(self::numericLabels($labels));

        $results = [];
        // \R matches any line ending, independent of the PHP_EOL of this platform.
        foreach (preg_split('/\R/', $rawPredictions) as $line) {
            $line = trim($line);
            if ('' === $line) {
                continue;
            }

            $results[] = is_numeric($line) && isset($labelsByNumber[(int) $line])
                ? $labelsByNumber[(int) $line]
                : false;
        }

        return $results;
    }

    /**
     * Assign consecutive integers, starting at 0, to the distinct labels in
     * order of first appearance.
     *
     * @param array $labels
     *
     * @return array label => numeric id
     */
    public static function numericLabels(array $labels): array
    {
        $numericLabels = [];
        foreach ($labels as $label) {
            if (isset($numericLabels[$label])) {
                continue;
            }

            $numericLabels[$label] = count($numericLabels);
        }

        return $numericLabels;
    }

    /**
     * Render one sample as "index:value" pairs with 1-based indexes.
     *
     * @param array $sample features keyed by 0-based integer index
     *
     * @return string
     */
    private static function sampleRow(array $sample): string
    {
        // libsvm requires ascending indexes; sparse samples may arrive in
        // any key order. Lists are already ordered, so they are unaffected.
        ksort($sample);

        $row = [];
        foreach ($sample as $index => $feature) {
            $row[] = sprintf('%s:%s', $index + 1, $feature);
        }

        return implode(' ', $row);
    }
}

View File

@ -0,0 +1,28 @@
<?php
declare (strict_types = 1);
namespace Phpml\SupportVectorMachine;
/**
 * Kernel identifiers accepted by SupportVectorMachine, which writes the
 * chosen value after the "-t" option of the svm-train command.
 * In the formulas below u and v are feature vectors.
 */
abstract class Kernel
{
/**
 * Linear kernel: u'*v.
 */
const LINEAR = 0;
/**
 * Polynomial kernel: (gamma*u'*v + coef0)^degree.
 */
const POLYNOMIAL = 1;
/**
 * Radial basis function kernel: exp(-gamma*|u-v|^2).
 */
const RBF = 2;
/**
 * Sigmoid kernel: tanh(gamma*u'*v + coef0).
 */
const SIGMOID = 3;
}

View File

@ -0,0 +1,230 @@
<?php
declare (strict_types = 1);
namespace Phpml\SupportVectorMachine;
/**
 * Thin wrapper around the libsvm command line tools (svm-train and
 * svm-predict) expected in <root>/bin/libsvm/. Data is exchanged with the
 * tools through temporary files created in <root>/var/.
 */
class SupportVectorMachine
{
    /**
     * @var int one of the Type constants
     */
    private $type;

    /**
     * @var int one of the Kernel constants
     */
    private $kernel;

    /**
     * @var float
     */
    private $cost;

    /**
     * @var float
     */
    private $nu;

    /**
     * @var int
     */
    private $degree;

    /**
     * @var float|null
     */
    private $gamma;

    /**
     * @var float
     */
    private $coef0;

    /**
     * @var float
     */
    private $epsilon;

    /**
     * @var float
     */
    private $tolerance;

    /**
     * @var int
     */
    private $cacheSize;

    /**
     * @var bool
     */
    private $shrinking;

    /**
     * @var bool
     */
    private $probabilityEstimates;

    /**
     * @var string
     */
    private $binPath;

    /**
     * @var string
     */
    private $varPath;

    /**
     * @var string|null content of the trained model file, null before train()
     */
    private $model;

    /**
     * @var array
     */
    private $labels;

    /**
     * @param int        $type
     * @param int        $kernel
     * @param float      $cost
     * @param float      $nu
     * @param int        $degree
     * @param float|null $gamma
     * @param float      $coef0
     * @param float      $epsilon
     * @param float      $tolerance
     * @param int        $cacheSize
     * @param bool       $shrinking
     * @param bool       $probabilityEstimates
     */
    public function __construct(
        int $type, int $kernel, float $cost = 1.0, float $nu = 0.5, int $degree = 3,
        float $gamma = null, float $coef0 = 0.0, float $epsilon = 0.1, float $tolerance = 0.001,
        int $cacheSize = 100, bool $shrinking = true, bool $probabilityEstimates = false
    ) {
        $this->type = $type;
        $this->kernel = $kernel;
        $this->cost = $cost;
        $this->nu = $nu;
        $this->degree = $degree;
        $this->gamma = $gamma;
        $this->coef0 = $coef0;
        $this->epsilon = $epsilon;
        $this->tolerance = $tolerance;
        $this->cacheSize = $cacheSize;
        $this->shrinking = $shrinking;
        $this->probabilityEstimates = $probabilityEstimates;

        // Three directories above this file.
        $rootPath = realpath(implode(DIRECTORY_SEPARATOR, [dirname(__FILE__), '..', '..', '..'])).DIRECTORY_SEPARATOR;

        $this->binPath = $rootPath.'bin'.DIRECTORY_SEPARATOR.'libsvm'.DIRECTORY_SEPARATOR;
        $this->varPath = $rootPath.'var'.DIRECTORY_SEPARATOR;
    }

    /**
     * Train a model with svm-train and keep the resulting model file content.
     *
     * @param array $samples
     * @param array $labels class labels, or numeric targets for the SVR types
     *
     * @throws \RuntimeException when a temporary file cannot be written or
     *                           read, or when svm-train fails
     */
    public function train(array $samples, array $labels)
    {
        $this->labels = $labels;

        $isRegression = in_array($this->type, [Type::EPSILON_SVR, Type::NU_SVR], true);
        $trainingSet = DataTransformer::trainingSet($samples, $labels, $isRegression);

        $trainingSetFileName = $this->varPath.uniqid('phpml', true);
        $modelFileName = $trainingSetFileName.'-model';

        try {
            $this->writeFile($trainingSetFileName, $trainingSet);
            $this->runCommand($this->buildTrainCommand($trainingSetFileName, $modelFileName));
            $this->model = $this->readFile($modelFileName);
        } finally {
            // Temporary files are removed on failure as well.
            $this->removeFiles([$trainingSetFileName, $modelFileName]);
        }
    }

    /**
     * @return string|null content of the model file, null before train()
     */
    public function getModel()
    {
        return $this->model;
    }

    /**
     * Predict with svm-predict.
     *
     * @param array $samples a list of samples, or a single sample
     *
     * @return array|mixed a list of predictions for a list of samples, one
     *                     prediction for a single sample. Classification
     *                     types yield the original labels; other types
     *                     yield the raw output lines of svm-predict.
     *
     * @throws \RuntimeException when no model was trained, a temporary file
     *                           cannot be written or read, or svm-predict fails
     */
    public function predict(array $samples)
    {
        if (!is_string($this->model)) {
            throw new \RuntimeException('The model has to be trained before it can predict.');
        }

        if ([] === $samples) {
            return [];
        }

        $singleSample = !is_array(reset($samples));

        $testSetFileName = $this->varPath.uniqid('phpml', true);
        $modelFileName = $testSetFileName.'-model';
        $outputFileName = $testSetFileName.'-output';

        try {
            $this->writeFile($testSetFileName, DataTransformer::testSet($samples));
            $this->writeFile($modelFileName, $this->model);

            $binary = sprintf('%ssvm-predict%s', $this->binPath, $this->getOSExtension());
            $this->runCommand($this->appendFileArguments($binary, [$testSetFileName, $modelFileName, $outputFileName]));

            $predictions = $this->readFile($outputFileName);
        } finally {
            $this->removeFiles([$testSetFileName, $modelFileName, $outputFileName]);
        }

        if (in_array($this->type, [Type::C_SVC, Type::NU_SVC], true)) {
            $predictions = DataTransformer::predictions($predictions, $this->labels);
        } else {
            $predictions = explode(PHP_EOL, trim($predictions));
        }

        if ($singleSample) {
            return $predictions[0];
        }

        return $predictions;
    }

    /**
     * @return string '.exe' on Windows, empty string elsewhere
     */
    private function getOSExtension()
    {
        if (strtoupper(substr(PHP_OS, 0, 3)) === 'WIN') {
            return '.exe';
        }

        return '';
    }

    /**
     * Build the complete, shell-safe svm-train command line.
     *
     * @param string $trainingSetFileName
     * @param string $modelFileName
     *
     * @return string
     */
    private function buildTrainCommand(string $trainingSetFileName, string $modelFileName): string
    {
        // -g is only passed when a gamma was configured.
        $binaryAndOptions = sprintf('%ssvm-train%s -s %s -t %s -c %s -n %s -d %s%s -r %s -p %s -m %s -e %s -h %d -b %d',
            $this->binPath,
            $this->getOSExtension(),
            $this->type,
            $this->kernel,
            $this->cost,
            $this->nu,
            $this->degree,
            $this->gamma !== null ? ' -g '.$this->gamma : '',
            $this->coef0,
            $this->epsilon,
            $this->cacheSize,
            $this->tolerance,
            $this->shrinking,
            $this->probabilityEstimates
        );

        return $this->appendFileArguments($binaryAndOptions, [$trainingSetFileName, $modelFileName]);
    }

    /**
     * Escape the binary/options part of a command and append each file
     * name as an individually quoted argument.
     *
     * @param string   $binaryAndOptions
     * @param string[] $fileNames
     *
     * @return string
     */
    private function appendFileArguments(string $binaryAndOptions, array $fileNames): string
    {
        // escapeshellarg() applies the quoting of the current platform;
        // hard-coded single quotes are not understood by the Windows shell.
        return escapeshellcmd($binaryAndOptions).' '.implode(' ', array_map('escapeshellarg', $fileNames));
    }

    /**
     * Run an already escaped command and fail loudly when it does not succeed.
     *
     * @param string $command
     *
     * @throws \RuntimeException on a non-zero exit code
     */
    private function runCommand(string $command)
    {
        $output = [];
        $exitCode = 0;
        // stderr is merged into the captured output so it can be reported.
        exec($command.' 2>&1', $output, $exitCode);

        if (0 !== $exitCode) {
            throw new \RuntimeException(sprintf('Command "%s" failed with exit code %d: %s', $command, $exitCode, implode(PHP_EOL, $output)));
        }
    }

    /**
     * @param string $fileName
     * @param string $content
     *
     * @throws \RuntimeException when the file cannot be written
     */
    private function writeFile(string $fileName, string $content)
    {
        if (false === file_put_contents($fileName, $content)) {
            throw new \RuntimeException(sprintf('File "%s" could not be written.', $fileName));
        }
    }

    /**
     * @param string $fileName
     *
     * @return string
     *
     * @throws \RuntimeException when the file cannot be read
     */
    private function readFile(string $fileName): string
    {
        $content = is_readable($fileName) ? file_get_contents($fileName) : false;
        if (false === $content) {
            throw new \RuntimeException(sprintf('File "%s" could not be read.', $fileName));
        }

        return $content;
    }

    /**
     * Delete the given files, ignoring those that were never created.
     *
     * @param string[] $fileNames
     */
    private function removeFiles(array $fileNames)
    {
        foreach ($fileNames as $fileName) {
            if (is_file($fileName)) {
                unlink($fileName);
            }
        }
    }
}

View File

@ -0,0 +1,33 @@
<?php
declare (strict_types = 1);
namespace Phpml\SupportVectorMachine;
/**
 * SVM type identifiers accepted by SupportVectorMachine, which writes the
 * chosen value after the "-s" option of the svm-train command.
 */
abstract class Type
{
/**
 * C-SVC: classification.
 */
const C_SVC = 0;
/**
 * nu-SVC: classification.
 */
const NU_SVC = 1;
/**
 * One-class SVM: distribution estimation.
 */
const ONE_CLASS_SVM = 2;
/**
 * epsilon-SVR: regression.
 */
const EPSILON_SVR = 3;
/**
 * nu-SVR: regression.
 */
const NU_SVR = 4;
}

View File

@ -0,0 +1,15 @@
<?php
declare (strict_types = 1);
namespace Phpml\Tokenization;
/**
 * Contract for tokenizers: split a text into an array of tokens.
 */
interface Tokenizer
{
/**
 * @param string $text text to split
 *
 * @return array the tokens found in $text
 */
public function tokenize(string $text): array;
}

View File

@ -0,0 +1,18 @@
<?php
declare (strict_types = 1);
namespace Phpml\Tokenization;
/**
 * Splits a UTF-8 text on runs of Unicode separator (\pZ) and control/other
 * (\pC) characters; punctuation stays attached to the tokens.
 */
class WhitespaceTokenizer implements Tokenizer
{
    /**
     * @param string $text UTF-8 encoded text
     *
     * @return array non-empty tokens, in order of appearance
     *
     * @throws \InvalidArgumentException when the text cannot be processed,
     *                                   e.g. because it is not valid UTF-8
     */
    public function tokenize(string $text): array
    {
        $tokens = preg_split('/[\pZ\pC]+/u', $text, -1, PREG_SPLIT_NO_EMPTY);

        // preg_split() returns false on failure, which would otherwise
        // surface as a return type error far from its cause.
        if (false === $tokens) {
            throw new \InvalidArgumentException(sprintf('Text could not be tokenized (PCRE error code %d).', preg_last_error()));
        }

        return $tokens;
    }
}

View File

@ -0,0 +1,21 @@
<?php
declare (strict_types = 1);
namespace Phpml\Tokenization;
/**
 * Extracts the words of a UTF-8 text: every run of two or more word
 * characters (\w). Single-character words and punctuation are dropped.
 */
class WordTokenizer implements Tokenizer
{
    /**
     * @param string $text UTF-8 encoded text
     *
     * @return array words, in order of appearance
     *
     * @throws \InvalidArgumentException when the text cannot be processed,
     *                                   e.g. because it is not valid UTF-8
     */
    public function tokenize(string $text): array
    {
        $matches = [];

        // preg_match_all() returns false on failure; without this check the
        // method would read an undefined offset and violate its return type.
        if (false === preg_match_all('/\w\w+/u', $text, $matches)) {
            throw new \InvalidArgumentException(sprintf('Text could not be tokenized (PCRE error code %d).', preg_last_error()));
        }

        return $matches[0];
    }
}

View File

@ -0,0 +1,45 @@
<?php
declare (strict_types = 1);
namespace tests\Classification;
use Phpml\Classification\SVC;
use Phpml\SupportVectorMachine\Kernel;
/**
 * Tests SVC with a linear kernel on two linearly separable classes:
 * 'a' (small first feature, large second) and 'b' (the opposite).
 */
class SVCTest extends \PHPUnit_Framework_TestCase
{
// predict() called with one sample returns one label.
public function testPredictSingleSampleWithLinearKernel()
{
$samples = [[1, 3], [1, 4], [2, 4], [3, 1], [4, 1], [4, 2]];
$labels = ['a', 'a', 'a', 'b', 'b', 'b'];
$classifier = new SVC(Kernel::LINEAR, $cost = 1000);
$classifier->train($samples, $labels);
$this->assertEquals('b', $classifier->predict([3, 2]));
$this->assertEquals('b', $classifier->predict([5, 1]));
$this->assertEquals('b', $classifier->predict([4, 3]));
$this->assertEquals('b', $classifier->predict([4, -5]));
$this->assertEquals('a', $classifier->predict([2, 3]));
$this->assertEquals('a', $classifier->predict([1, 2]));
$this->assertEquals('a', $classifier->predict([1, 5]));
$this->assertEquals('a', $classifier->predict([3, 10]));
}
// predict() called with a list of samples returns the labels in the same order.
public function testPredictArrayOfSamplesWithLinearKernel()
{
$trainSamples = [[1, 3], [1, 4], [2, 4], [3, 1], [4, 1], [4, 2]];
$trainLabels = ['a', 'a', 'a', 'b', 'b', 'b'];
$testSamples = [[3, 2], [5, 1], [4, 3], [4, -5], [2, 3], [1, 2], [1, 5], [3, 10]];
$testLabels = ['b', 'b', 'b', 'b', 'a', 'a', 'a', 'a'];
$classifier = new SVC(Kernel::LINEAR, $cost = 1000);
$classifier->train($trainSamples, $trainLabels);
$predictions = $classifier->predict($testSamples);
$this->assertEquals($testLabels, $predictions);
}
}

View File

@ -0,0 +1,73 @@
<?php
declare (strict_types = 1);
namespace tests\Phpml\FeatureExtraction;
use Phpml\FeatureExtraction\TokenCountVectorizer;
use Phpml\Tokenization\WhitespaceTokenizer;
/**
 * Tests TokenCountVectorizer combined with the WhitespaceTokenizer.
 * Expected vectors are sparse: vocabulary index => number of occurrences.
 */
class TokenCountVectorizerTest extends \PHPUnit_Framework_TestCase
{
// Without a minimum frequency every token is counted; the vocabulary is
// expected in order of first appearance.
public function testTokenCountVectorizerWithWhitespaceTokenizer()
{
$samples = [
'Lorem ipsum dolor sit amet dolor',
'Mauris placerat ipsum dolor',
'Mauris diam eros fringilla diam',
];
$vocabulary = ['Lorem', 'ipsum', 'dolor', 'sit', 'amet', 'Mauris', 'placerat', 'diam', 'eros', 'fringilla'];
$vector = [
[0 => 1, 1 => 1, 2 => 2, 3 => 1, 4 => 1],
[5 => 1, 6 => 1, 1 => 1, 2 => 1],
[5 => 1, 7 => 2, 8 => 1, 9 => 1],
];
$vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer());
$this->assertEquals($vector, $vectorizer->transform($samples));
$this->assertEquals($vocabulary, $vectorizer->getVocabulary());
}
// With a minimum frequency, tokens found in too few samples are removed
// from the vectors but stay in the vocabulary.
public function testMinimumDocumentTokenCountFrequency()
{
// word at least in half samples
$samples = [
'Lorem ipsum dolor sit amet',
'Lorem ipsum sit amet',
'ipsum sit amet',
'ipsum sit amet',
];
$vocabulary = ['Lorem', 'ipsum', 'dolor', 'sit', 'amet'];
// 'dolor' (index 2) occurs in 1 of 4 samples and is dropped.
$vector = [
[0 => 1, 1 => 1, 3 => 1, 4 => 1],
[0 => 1, 1 => 1, 3 => 1, 4 => 1],
[1 => 1, 3 => 1, 4 => 1],
[1 => 1, 3 => 1, 4 => 1],
];
$vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), 0.5);
$this->assertEquals($vector, $vectorizer->transform($samples));
$this->assertEquals($vocabulary, $vectorizer->getVocabulary());
// word at least in all samples
$samples = [
'Lorem ipsum dolor sit amet',
'Morbi quis lacinia arcu. Sed eu sagittis Lorem',
'Suspendisse gravida consequat eros Lorem',
];
// Only 'Lorem' (index 0) occurs in every sample.
$vector = [
[0 => 1],
[0 => 1],
[0 => 1],
];
$vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), 1);
$this->assertEquals($vector, $vectorizer->transform($samples));
}
}

View File

@ -8,6 +8,14 @@ use Phpml\Math\Statistic\Mean;
class MeanTest extends \PHPUnit_Framework_TestCase
{
/**
* @expectedException \Phpml\Exception\InvalidArgumentException
*/
public function testArithmeticThrowExceptionOnEmptyArray()
{
Mean::arithmetic([]);
}
public function testArithmeticMean()
{
$delta = 0.01;
@ -15,4 +23,41 @@ class MeanTest extends \PHPUnit_Framework_TestCase
$this->assertEquals(41.16, Mean::arithmetic([43, 21, 25, 42, 57, 59]), '', $delta);
$this->assertEquals(1.7, Mean::arithmetic([0.5, 0.5, 1.5, 2.5, 3.5]), '', $delta);
}
/**
* @expectedException \Phpml\Exception\InvalidArgumentException
*/
public function testMedianThrowExceptionOnEmptyArray()
{
Mean::median([]);
}
public function testMedianOnOddLengthArray()
{
$numbers = [5, 2, 6, 1, 3];
$this->assertEquals(3, Mean::median($numbers));
}
public function testMedianOnEvenLengthArray()
{
$numbers = [5, 2, 6, 1, 3, 4];
$this->assertEquals(3.5, Mean::median($numbers));
}
/**
* @expectedException \Phpml\Exception\InvalidArgumentException
*/
public function testModeThrowExceptionOnEmptyArray()
{
Mean::mode([]);
}
public function testModeOnArray()
{
$numbers = [5, 2, 6, 1, 3, 4, 6, 6, 5];
$this->assertEquals(6, Mean::mode($numbers));
}
}

View File

@ -0,0 +1,149 @@
<?php
declare (strict_types = 1);
namespace tests\Preprocessing;
use Phpml\Preprocessing\Imputer;
use Phpml\Preprocessing\Imputer\Strategy\MeanStrategy;
use Phpml\Preprocessing\Imputer\Strategy\MedianStrategy;
use Phpml\Preprocessing\Imputer\Strategy\MostFrequentStrategy;
/**
 * Tests the Imputer with each strategy (mean, median, most frequent) on
 * both axes. In every data set null marks a missing value.
 */
class ImputerTest extends \PHPUnit_Framework_TestCase
{
// Each null is replaced by the mean of the known values of its column,
// e.g. column 1: (3 + 6 + 7) / 3 = 5.33.
public function testComplementsMissingValuesWithMeanStrategyOnColumnAxis()
{
$data = [
[1, null, 3, 4],
[4, 3, 2, 1],
[null, 6, 7, 8],
[8, 7, null, 5],
];
$imputeData = [
[1, 5.33, 3, 4],
[4, 3, 2, 1],
[4.33, 6, 7, 8],
[8, 7, 4, 5],
];
$imputer = new Imputer(null, new MeanStrategy(), Imputer::AXIS_COLUMN);
$imputer->preprocess($data);
$this->assertEquals($imputeData, $data, '', $delta = 0.01);
}
// Each null is replaced by the mean of the known values of its row,
// e.g. row 0: (1 + 3 + 4) / 3 = 2.66.
public function testComplementsMissingValuesWithMeanStrategyOnRowAxis()
{
$data = [
[1, null, 3, 4],
[4, 3, 2, 1],
[null, 6, 7, 8],
[8, 7, null, 5],
];
$imputeData = [
[1, 2.66, 3, 4],
[4, 3, 2, 1],
[7, 6, 7, 8],
[8, 7, 6.66, 5],
];
$imputer = new Imputer(null, new MeanStrategy(), Imputer::AXIS_ROW);
$imputer->preprocess($data);
$this->assertEquals($imputeData, $data, '', $delta = 0.01);
}
// Each null is replaced by the median of the known values of its column.
public function testComplementsMissingValuesWithMediaStrategyOnColumnAxis()
{
$data = [
[1, null, 3, 4],
[4, 3, 2, 1],
[null, 6, 7, 8],
[8, 7, null, 5],
];
$imputeData = [
[1, 6, 3, 4],
[4, 3, 2, 1],
[4, 6, 7, 8],
[8, 7, 3, 5],
];
$imputer = new Imputer(null, new MedianStrategy(), Imputer::AXIS_COLUMN);
$imputer->preprocess($data);
$this->assertEquals($imputeData, $data, '', $delta = 0.01);
}
// Each null is replaced by the median of the known values of its row.
public function testComplementsMissingValuesWithMediaStrategyOnRowAxis()
{
$data = [
[1, null, 3, 4],
[4, 3, 2, 1],
[null, 6, 7, 8],
[8, 7, null, 5],
];
$imputeData = [
[1, 3, 3, 4],
[4, 3, 2, 1],
[7, 6, 7, 8],
[8, 7, 7, 5],
];
$imputer = new Imputer(null, new MedianStrategy(), Imputer::AXIS_ROW);
$imputer->preprocess($data);
$this->assertEquals($imputeData, $data, '', $delta = 0.01);
}
// Each null is replaced by the most frequent known value of its column.
public function testComplementsMissingValuesWithMostFrequentStrategyOnColumnAxis()
{
$data = [
[1, null, 3, 4],
[4, 3, 2, 1],
[null, 6, 7, 8],
[8, 7, null, 5],
[8, 3, 2, 5],
];
$imputeData = [
[1, 3, 3, 4],
[4, 3, 2, 1],
[8, 6, 7, 8],
[8, 7, 2, 5],
[8, 3, 2, 5],
];
$imputer = new Imputer(null, new MostFrequentStrategy(), Imputer::AXIS_COLUMN);
$imputer->preprocess($data);
$this->assertEquals($imputeData, $data);
}
// Each null is replaced by the most frequent known value of its row.
public function testComplementsMissingValuesWithMostFrequentStrategyOnRowAxis()
{
$data = [
[1, null, 3, 4, 3],
[4, 3, 2, 1, 7],
[null, 6, 7, 8, 6],
[8, 7, null, 5, 5],
[8, 3, 2, 5, 4],
];
$imputeData = [
[1, 3, 3, 4, 3],
[4, 3, 2, 1, 7],
[6, 6, 7, 8, 6],
[8, 7, 5, 5, 5],
[8, 3, 2, 5, 4],
];
$imputer = new Imputer(null, new MostFrequentStrategy(), Imputer::AXIS_ROW);
$imputer->preprocess($data);
$this->assertEquals($imputeData, $data);
}
}

View File

@ -0,0 +1,58 @@
<?php
declare (strict_types = 1);
namespace tests\Preprocessing;
use Phpml\Preprocessing\Normalizer;
/**
 * Tests the Normalizer: rejection of unknown norms and row-wise scaling
 * with the L2 (default) and L1 norms.
 */
class NormalizerTest extends \PHPUnit_Framework_TestCase
{
/**
 * A norm other than NORM_L1 / NORM_L2 is rejected by the constructor.
 *
 * @expectedException \Phpml\Exception\NormalizerException
 */
public function testThrowExceptionOnInvalidNorm()
{
new Normalizer(99);
}
// Default norm is L2: each row is divided by its Euclidean length,
// e.g. [1, -1, 2] / sqrt(6).
public function testNormalizeSamplesWithL2Norm()
{
$samples = [
[1, -1, 2],
[2, 0, 0],
[0, 1, -1],
];
$normalized = [
[0.4, -0.4, 0.81],
[1.0, 0.0, 0.0],
[0.0, 0.7, -0.7],
];
$normalizer = new Normalizer();
$normalizer->preprocess($samples);
$this->assertEquals($normalized, $samples, '', $delta = 0.01);
}
// L1: each row is divided by the sum of its absolute values,
// e.g. [1, -1, 2] / 4.
public function testNormalizeSamplesWithL1Norm()
{
$samples = [
[1, -1, 2],
[2, 0, 0],
[0, 1, -1],
];
$normalized = [
[0.25, -0.25, 0.5],
[1.0, 0.0, 0.0],
[0.0, 0.5, -0.5],
];
$normalizer = new Normalizer(Normalizer::NORM_L1);
$normalizer->preprocess($samples);
$this->assertEquals($normalized, $samples, '', $delta = 0.01);
}
}

View File

@ -0,0 +1,37 @@
<?php
declare (strict_types = 1);
namespace tests\Regression;
use Phpml\Regression\SVR;
use Phpml\SupportVectorMachine\Kernel;
/**
 * Tests SVR with a linear kernel. Predictions are compared with a
 * tolerance of 0.01.
 */
class SVRTest extends \PHPUnit_Framework_TestCase
{
// One feature per sample; predict() receives a single sample and returns a single value.
public function testPredictSingleFeatureSamples()
{
$delta = 0.01;
$samples = [[60], [61], [62], [63], [65]];
$targets = [3.1, 3.6, 3.8, 4, 4.1];
$regression = new SVR(Kernel::LINEAR);
$regression->train($samples, $targets);
$this->assertEquals(4.03, $regression->predict([64]), '', $delta);
}
// Two features per sample; predict() receives a list of samples and returns a list of values.
public function testPredictMultiFeaturesSamples()
{
$delta = 0.01;
$samples = [[73676, 1996], [77006, 1998], [10565, 2000], [146088, 1995], [15000, 2001], [65940, 2000], [9300, 2000], [93739, 1996], [153260, 1994], [17764, 2002], [57000, 1998], [15000, 2000]];
$targets = [2000, 2750, 15500, 960, 4400, 8800, 7100, 2550, 1025, 5900, 4600, 4400];
$regression = new SVR(Kernel::LINEAR);
$regression->train($samples, $targets);
$this->assertEquals([4109.82, 4112.28], $regression->predict([[60000, 1996], [60000, 2000]]), '', $delta);
}
}

View File

@ -0,0 +1,39 @@
<?php
declare (strict_types = 1);
namespace tests\SupportVectorMachine;
use Phpml\SupportVectorMachine\DataTransformer;
/**
 * Tests the text format produced by DataTransformer: one line per sample,
 * "<label> <index>:<value> ..." with 1-based indexes, followed by a space
 * and PHP_EOL.
 */
class DataTransformerTest extends \PHPUnit_Framework_TestCase
{
// Labels are replaced by numeric ids in order of first appearance ('a' => 0, 'b' => 1).
public function testTransformDatasetToTrainingSet()
{
$samples = [[1, 1], [2, 1], [3, 2], [4, 5]];
$labels = ['a', 'a', 'b', 'b'];
$trainingSet =
'0 1:1 2:1 '.PHP_EOL.
'0 1:2 2:1 '.PHP_EOL.
'1 1:3 2:2 '.PHP_EOL.
'1 1:4 2:5 '.PHP_EOL
;
$this->assertEquals($trainingSet, DataTransformer::trainingSet($samples, $labels));
}
// Test sets have no labels; every line starts with the placeholder 0.
public function testTransformSamplesToTestSet()
{
$samples = [[1, 1], [2, 1], [3, 2], [4, 5]];
$testSet =
'0 1:1 2:1 '.PHP_EOL.
'0 1:2 2:1 '.PHP_EOL.
'0 1:3 2:2 '.PHP_EOL.
'0 1:4 2:5 '.PHP_EOL
;
$this->assertEquals($testSet, DataTransformer::testSet($samples));
}
}

View File

@ -0,0 +1,82 @@
<?php
declare (strict_types = 1);
namespace tests\SupportVectorMachine;
use Phpml\SupportVectorMachine\Kernel;
use Phpml\SupportVectorMachine\SupportVectorMachine;
use Phpml\SupportVectorMachine\Type;
/**
 * Tests SupportVectorMachine directly (type C_SVC, cost 100): the content
 * of the trained model and the predictions for lists of samples.
 */
class SupportVectorMachineTest extends \PHPUnit_Framework_TestCase
{
// getModel() must return exactly the model text expected below.
public function testTrainCSVCModelWithLinearKernel()
{
$samples = [[1, 3], [1, 4], [2, 4], [3, 1], [4, 1], [4, 2]];
$labels = ['a', 'a', 'a', 'b', 'b', 'b'];
$model =
'svm_type c_svc
kernel_type linear
nr_class 2
total_sv 2
rho 0
label 0 1
nr_sv 1 1
SV
0.25 1:2 2:4
-0.25 1:4 2:2
';
$svm = new SupportVectorMachine(Type::C_SVC, Kernel::LINEAR, 100.0);
$svm->train($samples, $labels);
$this->assertEquals($model, $svm->getModel());
}
// Predictions come back as the original labels, in the order of the samples.
public function testPredictSampleWithLinearKernel()
{
$samples = [[1, 3], [1, 4], [2, 4], [3, 1], [4, 1], [4, 2]];
$labels = ['a', 'a', 'a', 'b', 'b', 'b'];
$svm = new SupportVectorMachine(Type::C_SVC, Kernel::LINEAR, 100.0);
$svm->train($samples, $labels);
$predictions = $svm->predict([
[3, 2],
[2, 3],
[4, -5],
]);
$this->assertEquals('b', $predictions[0]);
$this->assertEquals('a', $predictions[1]);
$this->assertEquals('b', $predictions[2]);
}
// Three classes with the RBF kernel.
public function testPredictSampleFromMultipleClassWithRbfKernel()
{
$samples = [
[1, 3], [1, 4], [1, 4],
[3, 1], [4, 1], [4, 2],
[-3, -1], [-4, -1], [-4, -2],
];
$labels = [
'a', 'a', 'a',
'b', 'b', 'b',
'c', 'c', 'c',
];
$svm = new SupportVectorMachine(Type::C_SVC, Kernel::RBF, 100.0);
$svm->train($samples, $labels);
$predictions = $svm->predict([
[1, 5],
[4, 3],
[-4, -3],
]);
$this->assertEquals('a', $predictions[0]);
$this->assertEquals('b', $predictions[1]);
$this->assertEquals('c', $predictions[2]);
}
}

View File

@ -0,0 +1,40 @@
<?php
declare (strict_types = 1);
namespace tests\Tokenization;
use Phpml\Tokenization\WhitespaceTokenizer;
/**
 * Tests WhitespaceTokenizer: the text is split on whitespace (including
 * line breaks) and punctuation stays attached to the tokens.
 */
class WhitespaceTokenizerTest extends \PHPUnit_Framework_TestCase
{
// ASCII text spanning several lines.
public function testTokenizationOnAscii()
{
$tokenizer = new WhitespaceTokenizer();
$text = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Cras consectetur, dui et lobortis auctor.
Nulla vitae congue lorem.';
$tokens = ['Lorem', 'ipsum', 'dolor', 'sit', 'amet,', 'consectetur', 'adipiscing', 'elit.',
'Cras', 'consectetur,', 'dui', 'et', 'lobortis', 'auctor.',
'Nulla', 'vitae', 'congue', 'lorem.', ];
$this->assertEquals($tokens, $tokenizer->tokenize($text));
}
// Multi-byte (UTF-8) text.
// NOTE(review): the expected tokens contain single-character words
// (e.g. '鳼', '蒮', '蒛') that do not occur in $text as it appears here;
// confirm the fixture text was not damaged by an encoding or
// whitespace-stripping step.
public function testTokenizationOnUtf8()
{
$tokenizer = new WhitespaceTokenizer();
$text = '鋍鞎 鞮鞢騉 袟袘觕, 炟砏 謺貙蹖 偢偣唲 箷箯緷 鑴鱱爧 覮轀,
剆坲 煘煓瑐 鬐鶤鶐 飹勫嫢 枲柊氠 鍎鞚韕 焲犈,
殍涾烰 齞齝囃 蹅輶 , 孻憵 擙樲橚 藒襓謥 岯岪弨 廞徲 孻憵懥 趡趛踠 ';
$tokens = ['鋍鞎', '鳼', '鞮鞢騉', '袟袘觕,', '炟砏', '蒮', '謺貙蹖', '偢偣唲', '蒛', '箷箯緷', '鑴鱱爧', '覮轀,',
'剆坲', '煘煓瑐', '鬐鶤鶐', '飹勫嫢', '銪', '餀', '枲柊氠', '鍎鞚韕', '焲犈,',
'殍涾烰', '齞齝囃', '蹅輶', '鄜,', '孻憵', '擙樲橚', '藒襓謥', '岯岪弨', '蒮', '廞徲', '孻憵懥', '趡趛踠', '槏', ];
$this->assertEquals($tokens, $tokenizer->tokenize($text));
}
}

View File

@ -0,0 +1,40 @@
<?php
declare (strict_types = 1);
namespace tests\Tokenization;
use Phpml\Tokenization\WordTokenizer;
/**
 * Tests WordTokenizer: only runs of two or more word characters are
 * returned, so punctuation and single-character words are dropped.
 */
class WordTokenizerTest extends \PHPUnit_Framework_TestCase
{
// ASCII text with hyphens, slashes, semicolons and other punctuation between words.
public function testTokenizationOnAscii()
{
$tokenizer = new WordTokenizer();
$text = 'Lorem ipsum-dolor sit amet, consectetur/adipiscing elit.
Cras consectetur, dui et lobortis;auctor.
Nulla vitae ,.,/ congue lorem.';
$tokens = ['Lorem', 'ipsum', 'dolor', 'sit', 'amet', 'consectetur', 'adipiscing', 'elit',
'Cras', 'consectetur', 'dui', 'et', 'lobortis', 'auctor',
'Nulla', 'vitae', 'congue', 'lorem', ];
$this->assertEquals($tokens, $tokenizer->tokenize($text));
}
// Multi-byte (UTF-8) text; the trailing commas are not part of the expected tokens.
public function testTokenizationOnUtf8()
{
$tokenizer = new WordTokenizer();
$text = '鋍鞎 鞮鞢騉 袟袘觕, 炟砏 謺貙蹖 偢偣唲 箷箯緷 鑴鱱爧 覮轀,
剆坲 煘煓瑐 鬐鶤鶐 飹勫嫢 枲柊氠 鍎鞚韕 焲犈,
殍涾烰 齞齝囃 蹅輶 , 孻憵 擙樲橚 藒襓謥 岯岪弨 廞徲 孻憵懥 趡趛踠 ';
$tokens = ['鋍鞎', '鞮鞢騉', '袟袘觕', '炟砏', '謺貙蹖', '偢偣唲', '箷箯緷', '鑴鱱爧', '覮轀',
'剆坲', '煘煓瑐', '鬐鶤鶐', '飹勫嫢', '枲柊氠', '鍎鞚韕', '焲犈',
'殍涾烰', '齞齝囃', '蹅輶', '孻憵', '擙樲橚', '藒襓謥', '岯岪弨', '廞徲', '孻憵懥', '趡趛踠', ];
$this->assertEquals($tokens, $tokenizer->tokenize($text));
}
}

0
var/.gitkeep Normal file
View File