diff --git a/README.md b/README.md index 457f3c1..feac3e5 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,24 @@ -# PHP Machine learning library +# PHP Machine Learning library + +[![Build Status](https://scrutinizer-ci.com/g/php-ai/php-ml/badges/build.png?b=develop)](https://scrutinizer-ci.com/g/php-ai/php-ml/build-status/develop) +[![Documentation Status](https://readthedocs.org/projects/php-ml/badge/?version=develop)](http://php-ml.readthedocs.org/en/develop/?badge=develop) +[![Total Downloads](https://poser.pugx.org/php-ai/php-ml/downloads.svg)](https://packagist.org/packages/php-ai/php-ml) +[![License](https://poser.pugx.org/php-ai/php-ml/license.svg)](https://packagist.org/packages/php-ai/php-ml) +[![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/php-ai/php-ml/badges/quality-score.png?b=develop)](https://scrutinizer-ci.com/g/php-ai/php-ml/?branch=develop) Fresh approach to machine learning in PHP. Note that at the moment PHP is not the best choice for machine learning but maybe this will change ... -## Available Algorithms +## Documentation -### Classification - -Identifying to which category an object belongs to. - -* **Naive Bayes** - algorithm based on applying Bayes’ theorem with the “naive” assumption of independence between every pair of features +To find out how to use PHP-ML follow [Documentation](http://php-ml.readthedocs.org/). ## Installation -This repo will be published do packagist.org soon... +Currently this library is under development, but you can install it with Composer: + +``` +composer require php-ai/php-ml +``` ## To-Do @@ -21,7 +27,7 @@ This repo will be published do packagist.org soon...
## Testing -After installation, you can launch the test suite in project root directory (you will need to install dev requiremnts with composer) +After installation, you can launch the test suite in project root directory (you will need to install dev requirements with composer) ``` bin/phpunit @@ -33,3 +39,4 @@ PHP-ML is released under the MIT Licence. See the bundled LICENSE file for detai ## Author +Arkadiusz Kondas (@ArkadiuszKondas) \ No newline at end of file diff --git a/composer.json b/composer.json index 7f6a744..4f0dd26 100644 --- a/composer.json +++ b/composer.json @@ -1,7 +1,7 @@ { "name": "php-ai/php-ml", "type": "library", - "description": "PHP Machine learning library", + "description": "PHP Machine Learning library", "license": "MIT", "keywords": ["machine learning","pattern recognition","computational learning theory","artificial intelligence"], "homepage": "https://github.com/php-ai/php-ml", @@ -11,11 +11,16 @@ "email": "arkadiusz.kondas@gmail.com" } ], + "autoload": { + "psr-0": { + "Phpml": "src/" + } + }, "config": { "bin-dir": "bin" }, "require": { - "php": ">=5.5.0" + "php": ">=7.0.0" }, "require-dev": { "phpunit/phpunit": "^5.2" diff --git a/composer.lock b/composer.lock index 1117546..667ba6a 100644 --- a/composer.lock +++ b/composer.lock @@ -4,8 +4,8 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file", "This file is @generated automatically" ], - "hash": "640f762012a359b150ce245491743448", - "content-hash": "5efa8db5a672e2128d20c80c18746c72", + "hash": "7c34eebd6b8749a1cd09df57e5d1f47a", + "content-hash": "087091d0c339e9fa3a551a189ea658bf", "packages": [], "packages-dev": [ { @@ -155,22 +155,24 @@ }, { "name": "phpspec/prophecy", - "version": "v1.5.0", + "version": "v1.6.0", "source": { "type": "git", "url": "https://github.com/phpspec/prophecy.git", - "reference": "4745ded9307786b730d7a60df5cb5a6c43cf95f7" + "reference": "3c91bdf81797d725b14cb62906f9a4ce44235972" }, "dist": { "type": "zip", 
- "url": "https://api.github.com/repos/phpspec/prophecy/zipball/4745ded9307786b730d7a60df5cb5a6c43cf95f7", - "reference": "4745ded9307786b730d7a60df5cb5a6c43cf95f7", + "url": "https://api.github.com/repos/phpspec/prophecy/zipball/3c91bdf81797d725b14cb62906f9a4ce44235972", + "reference": "3c91bdf81797d725b14cb62906f9a4ce44235972", "shasum": "" }, "require": { "doctrine/instantiator": "^1.0.2", + "php": "^5.3|^7.0", "phpdocumentor/reflection-docblock": "~2.0", - "sebastian/comparator": "~1.1" + "sebastian/comparator": "~1.1", + "sebastian/recursion-context": "~1.0" }, "require-dev": { "phpspec/phpspec": "~2.0" @@ -178,7 +180,7 @@ "type": "library", "extra": { "branch-alias": { - "dev-master": "1.4.x-dev" + "dev-master": "1.5.x-dev" } }, "autoload": { @@ -211,27 +213,28 @@ "spy", "stub" ], - "time": "2015-08-13 10:07:40" + "time": "2016-02-15 07:46:21" }, { "name": "phpunit/php-code-coverage", - "version": "3.1.1", + "version": "3.3.0", "source": { "type": "git", "url": "https://github.com/sebastianbergmann/php-code-coverage.git", - "reference": "92f5c61b5c64159faec5298325ffab0c7e59dcc8" + "reference": "fe33716763b604ade4cb442c0794f5bd5ad73004" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/sebastianbergmann/php-code-coverage/zipball/92f5c61b5c64159faec5298325ffab0c7e59dcc8", - "reference": "92f5c61b5c64159faec5298325ffab0c7e59dcc8", + "url": "https://api.github.com/repos/sebastianbergmann/php-code-coverage/zipball/fe33716763b604ade4cb442c0794f5bd5ad73004", + "reference": "fe33716763b604ade4cb442c0794f5bd5ad73004", "shasum": "" }, "require": { - "php": ">=5.6", + "php": "^5.6 || ^7.0", "phpunit/php-file-iterator": "~1.3", "phpunit/php-text-template": "~1.2", - "phpunit/php-token-stream": "~1.3", + "phpunit/php-token-stream": "^1.4.2", + "sebastian/code-unit-reverse-lookup": "~1.0", "sebastian/environment": "^1.3.2", "sebastian/version": "~1.0|~2.0" }, @@ -247,7 +250,7 @@ "type": "library", "extra": { "branch-alias": { - "dev-master": "3.1.x-dev" + 
"dev-master": "3.3.x-dev" } }, "autoload": { @@ -273,7 +276,7 @@ "testing", "xunit" ], - "time": "2016-02-04 13:05:19" + "time": "2016-03-03 08:49:08" }, { "name": "phpunit/php-file-iterator", @@ -455,16 +458,16 @@ }, { "name": "phpunit/phpunit", - "version": "5.2.3", + "version": "5.3.1", "source": { "type": "git", "url": "https://github.com/sebastianbergmann/phpunit.git", - "reference": "6fdb1d3004ebc7071c4ac62f2881d67c5c11fb59" + "reference": "34a3acb401ae79deb37bc6e5f5ec3d325b369b4c" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/sebastianbergmann/phpunit/zipball/6fdb1d3004ebc7071c4ac62f2881d67c5c11fb59", - "reference": "6fdb1d3004ebc7071c4ac62f2881d67c5c11fb59", + "url": "https://api.github.com/repos/sebastianbergmann/phpunit/zipball/34a3acb401ae79deb37bc6e5f5ec3d325b369b4c", + "reference": "34a3acb401ae79deb37bc6e5f5ec3d325b369b4c", "shasum": "" }, "require": { @@ -474,18 +477,19 @@ "ext-reflection": "*", "ext-spl": "*", "myclabs/deep-copy": "~1.3", - "php": ">=5.6", + "php": "^5.6 || ^7.0", "phpspec/prophecy": "^1.3.1", - "phpunit/php-code-coverage": "~3.0", + "phpunit/php-code-coverage": "^3.3.0", "phpunit/php-file-iterator": "~1.4", "phpunit/php-text-template": "~1.2", - "phpunit/php-timer": ">=1.0.6", - "phpunit/phpunit-mock-objects": ">=3.0.5", + "phpunit/php-timer": "^1.0.6", + "phpunit/phpunit-mock-objects": "^3.1", "sebastian/comparator": "~1.1", "sebastian/diff": "~1.2", "sebastian/environment": "~1.3", "sebastian/exporter": "~1.2", "sebastian/global-state": "~1.0", + "sebastian/object-enumerator": "~1.0", "sebastian/resource-operations": "~1.0", "sebastian/version": "~1.0|~2.0", "symfony/yaml": "~2.1|~3.0" @@ -499,7 +503,7 @@ "type": "library", "extra": { "branch-alias": { - "dev-master": "5.2.x-dev" + "dev-master": "5.3.x-dev" } }, "autoload": { @@ -525,20 +529,20 @@ "testing", "xunit" ], - "time": "2016-02-08 12:15:53" + "time": "2016-04-07 07:04:34" }, { "name": "phpunit/phpunit-mock-objects", - "version": "3.0.6", + 
"version": "3.1.2", "source": { "type": "git", "url": "https://github.com/sebastianbergmann/phpunit-mock-objects.git", - "reference": "49bc700750196c04dd6bc2c4c99cb632b893836b" + "reference": "7c34c9bdde4131b824086457a3145e27dba10ca1" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/sebastianbergmann/phpunit-mock-objects/zipball/49bc700750196c04dd6bc2c4c99cb632b893836b", - "reference": "49bc700750196c04dd6bc2c4c99cb632b893836b", + "url": "https://api.github.com/repos/sebastianbergmann/phpunit-mock-objects/zipball/7c34c9bdde4131b824086457a3145e27dba10ca1", + "reference": "7c34c9bdde4131b824086457a3145e27dba10ca1", "shasum": "" }, "require": { @@ -556,7 +560,7 @@ "type": "library", "extra": { "branch-alias": { - "dev-master": "3.0.x-dev" + "dev-master": "3.1.x-dev" } }, "autoload": { @@ -581,7 +585,52 @@ "mock", "xunit" ], - "time": "2015-12-08 08:47:06" + "time": "2016-03-24 05:58:25" + }, + { + "name": "sebastian/code-unit-reverse-lookup", + "version": "1.0.0", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/code-unit-reverse-lookup.git", + "reference": "c36f5e7cfce482fde5bf8d10d41a53591e0198fe" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/code-unit-reverse-lookup/zipball/c36f5e7cfce482fde5bf8d10d41a53591e0198fe", + "reference": "c36f5e7cfce482fde5bf8d10d41a53591e0198fe", + "shasum": "" + }, + "require": { + "php": ">=5.6" + }, + "require-dev": { + "phpunit/phpunit": "~5" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.0.x-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de" + } + ], + "description": "Looks up which function or method a line of code belongs to", + "homepage": "https://github.com/sebastianbergmann/code-unit-reverse-lookup/", + "time": 
"2016-02-13 06:45:14" }, { "name": "sebastian/comparator", @@ -701,16 +750,16 @@ }, { "name": "sebastian/environment", - "version": "1.3.3", + "version": "1.3.5", "source": { "type": "git", "url": "https://github.com/sebastianbergmann/environment.git", - "reference": "6e7133793a8e5a5714a551a8324337374be209df" + "reference": "dc7a29032cf72b54f36dac15a1ca5b3a1b6029bf" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/sebastianbergmann/environment/zipball/6e7133793a8e5a5714a551a8324337374be209df", - "reference": "6e7133793a8e5a5714a551a8324337374be209df", + "url": "https://api.github.com/repos/sebastianbergmann/environment/zipball/dc7a29032cf72b54f36dac15a1ca5b3a1b6029bf", + "reference": "dc7a29032cf72b54f36dac15a1ca5b3a1b6029bf", "shasum": "" }, "require": { @@ -747,7 +796,7 @@ "environment", "hhvm" ], - "time": "2015-12-02 08:37:27" + "time": "2016-02-26 18:40:46" }, { "name": "sebastian/exporter", @@ -866,6 +915,52 @@ ], "time": "2015-10-12 03:26:01" }, + { + "name": "sebastian/object-enumerator", + "version": "1.0.0", + "source": { + "type": "git", + "url": "https://github.com/sebastianbergmann/object-enumerator.git", + "reference": "d4ca2fb70344987502567bc50081c03e6192fb26" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/sebastianbergmann/object-enumerator/zipball/d4ca2fb70344987502567bc50081c03e6192fb26", + "reference": "d4ca2fb70344987502567bc50081c03e6192fb26", + "shasum": "" + }, + "require": { + "php": ">=5.6", + "sebastian/recursion-context": "~1.0" + }, + "require-dev": { + "phpunit/phpunit": "~5" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.0.x-dev" + } + }, + "autoload": { + "classmap": [ + "src/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Sebastian Bergmann", + "email": "sebastian@phpunit.de" + } + ], + "description": "Traverses array structures and object graphs to enumerate all referenced 
objects", + "homepage": "https://github.com/sebastianbergmann/object-enumerator/", + "time": "2016-01-28 13:25:10" + }, { "name": "sebastian/recursion-context", "version": "1.0.2", @@ -1006,16 +1101,16 @@ }, { "name": "symfony/yaml", - "version": "v3.0.2", + "version": "v3.0.4", "source": { "type": "git", "url": "https://github.com/symfony/yaml.git", - "reference": "3cf0709d7fe936e97bee9e954382e449003f1d9a" + "reference": "0047c8366744a16de7516622c5b7355336afae96" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/symfony/yaml/zipball/3cf0709d7fe936e97bee9e954382e449003f1d9a", - "reference": "3cf0709d7fe936e97bee9e954382e449003f1d9a", + "url": "https://api.github.com/repos/symfony/yaml/zipball/0047c8366744a16de7516622c5b7355336afae96", + "reference": "0047c8366744a16de7516622c5b7355336afae96", "shasum": "" }, "require": { @@ -1051,7 +1146,7 @@ ], "description": "Symfony Yaml Component", "homepage": "https://symfony.com", - "time": "2016-02-02 13:44:19" + "time": "2016-03-04 07:55:57" } ], "aliases": [], @@ -1059,6 +1154,8 @@ "stability-flags": [], "prefer-stable": false, "prefer-lowest": false, - "platform": [], + "platform": { + "php": ">=7.0.0" + }, "platform-dev": [] } diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..c3e2703 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,37 @@ +# PHP Machine Learning (PHP-ML) + +[![Build Status](https://scrutinizer-ci.com/g/php-ai/php-ml/badges/build.png?b=develop)](https://scrutinizer-ci.com/g/php-ai/php-ml/build-status/develop) +[![Total Downloads](https://poser.pugx.org/php-ai/php-ml/downloads.svg)](https://packagist.org/packages/php-ai/php-ml) +[![License](https://poser.pugx.org/php-ai/php-ml/license.svg)](https://packagist.org/packages/php-ai/php-ml) +[![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/php-ai/php-ml/badges/quality-score.png?b=develop)](https://scrutinizer-ci.com/g/php-ai/php-ml/?branch=develop) + +Fresh approach to machine learning in PHP. 
Note that at the moment PHP is not the best choice for machine learning but maybe this will change ... + +## Installation + +Currently this library is under development, but you can install it with Composer: + +``` +composer require php-ai/php-ml +``` + +## To-Do + +* implement more algorithms +* integration with Lavacharts for data visualization + +## Testing + +After installation, you can launch the test suite in project root directory (you will need to install dev requirements with composer) + +``` +bin/phpunit +``` + +## License + +PHP-ML is released under the MIT Licence. See the bundled LICENSE file for details. + +## Author + +Arkadiusz Kondas (@ArkadiuszKondas) \ No newline at end of file diff --git a/docs/machine-learning/classification/knearestneighbors.md b/docs/machine-learning/classification/knearestneighbors.md new file mode 100644 index 0000000..569c48b --- /dev/null +++ b/docs/machine-learning/classification/knearestneighbors.md @@ -0,0 +1,35 @@ +# KNearestNeighbors Classifier + +Classifier implementing the k-nearest neighbors algorithm. + +### Constructor Parameters + +* $k - number of nearest neighbors to scan (default: 3) + +``` +$classifier = new KNearestNeighbors($k=4); +``` + +### Train + +To train a classifier simply provide train samples and labels (as `array`): + +``` +$samples = [[1, 3], [1, 4], [2, 4], [3, 1], [4, 1], [4, 2]]; +$labels = ['a', 'a', 'a', 'b', 'b', 'b']; + +$classifier = new KNearestNeighbors(); +$classifier->train($samples, $labels); +``` + +### Predict + +To predict sample class use `predict` method.
You can provide one sample or array of samples: + +``` +$classifier->predict([3, 2]); +// return 'b' + +$classifier->predict([[3, 2], [1, 5]]); +// return ['b', 'a'] +``` diff --git a/docs/machine-learning/cross-validation/randomsplit.md b/docs/machine-learning/cross-validation/randomsplit.md new file mode 100644 index 0000000..464f0db --- /dev/null +++ b/docs/machine-learning/cross-validation/randomsplit.md @@ -0,0 +1,29 @@ +# RandomSplit + +One of the simplest methods from Cross-validation is implemented as `RandomSplit` class. Samples are split into two groups: train group and test group. You can adjust the number of samples in each group. + +### Constructor Parameters + +* $dataset - object that implements `Dataset` interface +* $testSize - a fraction of test split (float, from 0 to 1, default: 0.3) +* $seed - seed for random generator (for tests) + +``` +$randomSplit = new RandomSplit($dataset, 0.2); +``` + +### Samples and labels groups + +To get samples or labels from test and train group you can use getters: + +``` +$dataset = new RandomSplit($dataset, 0.3, 1234); + +// train group +$dataset->getTrainSamples(); +$dataset->getTrainLabels(); + +// test group +$dataset->getTestSamples(); +$dataset->getTestLabels(); +``` diff --git a/docs/machine-learning/datasets/array-dataset.md b/docs/machine-learning/datasets/array-dataset.md new file mode 100644 index 0000000..5081ed8 --- /dev/null +++ b/docs/machine-learning/datasets/array-dataset.md @@ -0,0 +1,21 @@ +# ArrayDataset + +Helper class that holds data as PHP `array` type. Implements the `Dataset` interface which is used heavily in other classes.
+ +### Constructor Parameters + +* $samples - (array) of samples +* $labels - (array) of labels + +``` +$dataset = new ArrayDataset([[1, 1], [2, 1], [3, 2], [4, 1]], ['a', 'a', 'b', 'b']); +``` + +### Samples and labels + +To get samples or labels you can use getters: + +``` +$dataset->getSamples(); +$dataset->getLabels(); +``` diff --git a/docs/machine-learning/datasets/csv-dataset.md b/docs/machine-learning/datasets/csv-dataset.md new file mode 100644 index 0000000..553bc60 --- /dev/null +++ b/docs/machine-learning/datasets/csv-dataset.md @@ -0,0 +1,15 @@ +# CsvDataset + +Helper class that loads data from CSV file. It extends the `ArrayDataset`. + +### Constructor Parameters + +* $filepath - (string) path to `.csv` file +* $features - (int) number of columns that are features (starts from first column), last column must be a label +* $headingRow - (bool) defines whether the file has a heading row (if `true` then first row will be ignored) + +``` +$dataset = new CsvDataset('dataset.csv', 2, true); +``` + +See Array Dataset for more information. diff --git a/docs/machine-learning/datasets/demo/iris.md b/docs/machine-learning/datasets/demo/iris.md new file mode 100644 index 0000000..9e00d5c --- /dev/null +++ b/docs/machine-learning/datasets/demo/iris.md @@ -0,0 +1,34 @@ +# Iris Dataset + +Most popular and widely available dataset of iris flower measurement and class names.
+ +### Specification + +| Classes | 3 | +| Samples per class | 50 | +| Samples total | 150 | +| Features per sample | 4 | + +### Load + +To load Iris dataset simply use: + +``` +$dataset = new Iris(); +``` + +### Several samples + +``` +sepal length,sepal width,petal length,petal width,class +5.1,3.5,1.4,0.2,Iris-setosa +4.9,3.0,1.4,0.2,Iris-setosa +4.7,3.2,1.3,0.2,Iris-setosa +7.0,3.2,4.7,1.4,Iris-versicolor +6.4,3.2,4.5,1.5,Iris-versicolor +6.9,3.1,4.9,1.5,Iris-versicolor +6.3,3.3,6.0,2.5,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +7.1,3.0,5.9,2.1,Iris-virginica +6.3,2.9,5.6,1.8,Iris-virginica +``` diff --git a/docs/machine-learning/metric/accuracy.md b/docs/machine-learning/metric/accuracy.md new file mode 100644 index 0000000..b8ec70a --- /dev/null +++ b/docs/machine-learning/metric/accuracy.md @@ -0,0 +1,24 @@ +# Accuracy + +Class for calculating classifier accuracy. + +### Score + +To calculate classifier accuracy score use `score` static method. Parameters: + +* $actualLabels - (array) true sample labels +* $predictedLabels - (array) predicted labels (e.g. from test group) +* $normalize - (bool) normalize or not the result (default: true) + +### Example + +``` +$actualLabels = ['a', 'b', 'a', 'b']; +$predictedLabels = ['a', 'a', 'a', 'b']; + +Accuracy::score($actualLabels, $predictedLabels); +// return 0.75 + +Accuracy::score($actualLabels, $predictedLabels, false); +// return 3 +``` diff --git a/docs/machine-learning/metric/distance.md b/docs/machine-learning/metric/distance.md new file mode 100644 index 0000000..de8bcb1 --- /dev/null +++ b/docs/machine-learning/metric/distance.md @@ -0,0 +1,17 @@ +# Distance + +Special class for calculation of different types of distance.
+ +### Euclidean + +![euclidean](https://upload.wikimedia.org/math/8/4/9/849f040fd10bb86f7c85eb0bbe3566a4.png "Euclidean Distance") + +To calculate euclidean distance: + +``` +$a = [4, 6]; +$b = [2, 5]; + +Distance::euclidean($a, $b); +// return 2.2360679774998 +``` diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..f20036f --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,17 @@ +site_name: PHP Machine Learning (PHP-ML) +pages: + - Home: index.md + - Machine Learning: + - Classification: + - KNearestNeighbors: machine-learning/classification/knearestneighbors.md + - Cross Validation: + - RandomSplit: machine-learning/cross-validation/randomsplit.md + - Datasets: + - Array Dataset: machine-learning/datasets/array-dataset.md + - CSV Dataset: machine-learning/datasets/csv-dataset.md + - Demo: + - Iris: machine-learning/datasets/demo/iris.md + - Metric: + - Accuracy: machine-learning/metric/accuracy.md + - Distance: machine-learning/metric/distance.md +theme: readthedocs \ No newline at end of file diff --git a/phpunit.xml b/phpunit.xml new file mode 100644 index 0000000..d31e033 --- /dev/null +++ b/phpunit.xml @@ -0,0 +1,14 @@ + + + + + tests/* + + ​ + diff --git a/src/Phpml/Classifier/Classifier.php b/src/Phpml/Classifier/Classifier.php index ea2bbf1..90250a9 100644 --- a/src/Phpml/Classifier/Classifier.php +++ b/src/Phpml/Classifier/Classifier.php @@ -1,20 +1,21 @@ k = $k; + $this->samples = []; + $this->labels = []; + } + + /** + * @param array $samples + * @param array $labels + */ + public function train(array $samples, array $labels) + { + $this->samples = $samples; + $this->labels = $labels; + } + + /** + * @param array $samples + * + * @return mixed + */ + public function predict(array $samples) + { + if (!is_array($samples[0])) { + $predicted = $this->predictSample($samples); + } else { + $predicted = []; + foreach ($samples as $index => $sample) { + $predicted[$index] = $this->predictSample($sample); + } + } + + return $predicted; + } + + /** + 
* @param array $sample + * + * @return mixed + */ + private function predictSample(array $sample) + { + $distances = $this->kNeighborsDistances($sample); + + $predictions = array_combine(array_values($this->labels), array_fill(0, count($this->labels), 0)); + + foreach ($distances as $index => $distance) { + ++$predictions[$this->labels[$index]]; + } + + arsort($predictions); + reset($predictions); + + return key($predictions); + } + + /** + * @param array $sample + * + * @return array + * + * @throws \Phpml\Exception\InvalidArgumentException + */ + private function kNeighborsDistances(array $sample): array + { + $distances = []; + + foreach ($this->samples as $index => $neighbor) { + $distances[$index] = Distance::euclidean($sample, $neighbor); + } + + asort($distances); + + return array_slice($distances, 0, $this->k, true); + } +} diff --git a/src/Phpml/Classifier/NaiveBayes.php b/src/Phpml/Classifier/NaiveBayes.php index e852645..7324d79 100644 --- a/src/Phpml/Classifier/NaiveBayes.php +++ b/src/Phpml/Classifier/NaiveBayes.php @@ -1,26 +1,25 @@ = $testSize || 1 <= $testSize) { + throw InvalidArgumentException::percentNotInRange('testSize'); + } + $this->seedGenerator($seed); + + $samples = $dataset->getSamples(); + $labels = $dataset->getLabels(); + $datasetSize = count($samples); + + for ($i = $datasetSize; $i > 0; --$i) { + $key = mt_rand(0, $datasetSize - 1); + $setName = count($this->testSamples) / $datasetSize >= $testSize ? 
'train' : 'test'; + + $this->{$setName.'Samples'}[] = $samples[$key]; + $this->{$setName.'Labels'}[] = $labels[$key]; + + $samples = array_values($samples); + $labels = array_values($labels); + } + } + + /** + * @return array + */ + public function getTrainSamples() + { + return $this->trainSamples; + } + + /** + * @return array + */ + public function getTestSamples() + { + return $this->testSamples; + } + + /** + * @return array + */ + public function getTrainLabels() + { + return $this->trainLabels; + } + + /** + * @return array + */ + public function getTestLabels() + { + return $this->testLabels; + } + + /** + * @param int|null $seed + */ + private function seedGenerator(int $seed = null) + { + if (null === $seed) { + mt_srand(); + } else { + mt_srand($seed); + } + } +} diff --git a/src/Phpml/Dataset/ArrayDataset.php b/src/Phpml/Dataset/ArrayDataset.php new file mode 100644 index 0000000..d117122 --- /dev/null +++ b/src/Phpml/Dataset/ArrayDataset.php @@ -0,0 +1,52 @@ +samples = $samples; + $this->labels = $labels; + } + + /** + * @return array + */ + public function getSamples(): array + { + return $this->samples; + } + + /** + * @return array + */ + public function getLabels(): array + { + return $this->labels; + } +} diff --git a/src/Phpml/Dataset/CsvDataset.php b/src/Phpml/Dataset/CsvDataset.php new file mode 100644 index 0000000..e6dafd2 --- /dev/null +++ b/src/Phpml/Dataset/CsvDataset.php @@ -0,0 +1,44 @@ +samples[] = array_slice($data, 0, $features); + $this->labels[] = $data[$features]; + } + fclose($handle); + } else { + throw DatasetException::cantOpenFile(basename($filepath)); + } + } +} diff --git a/src/Phpml/Dataset/Dataset.php b/src/Phpml/Dataset/Dataset.php index 4c2378a..4e04931 100644 --- a/src/Phpml/Dataset/Dataset.php +++ b/src/Phpml/Dataset/Dataset.php @@ -1,10 +1,18 @@ $label) { + if ($label === $predictedLabels[$index]) { + ++$score; + } + } + + if ($normalize) { + $score = $score / count($actualLabels); + } + + return $score; + } +} diff 
--git a/src/Phpml/Metric/Distance.php b/src/Phpml/Metric/Distance.php new file mode 100644 index 0000000..1b92cef --- /dev/null +++ b/src/Phpml/Metric/Distance.php @@ -0,0 +1,34 @@ +train($samples, $labels); + + $this->assertEquals('b', $classifier->predict([3, 2])); + $this->assertEquals('b', $classifier->predict([5, 1])); + $this->assertEquals('b', $classifier->predict([4, 3])); + $this->assertEquals('b', $classifier->predict([4, -5])); + + $this->assertEquals('a', $classifier->predict([2, 3])); + $this->assertEquals('a', $classifier->predict([1, 2])); + $this->assertEquals('a', $classifier->predict([1, 5])); + $this->assertEquals('a', $classifier->predict([3, 10])); + } + + public function testPredictArrayOfSamples() + { + $trainSamples = [[1, 3], [1, 4], [2, 4], [3, 1], [4, 1], [4, 2]]; + $trainLabels = ['a', 'a', 'a', 'b', 'b', 'b']; + + $testSamples = [[3, 2], [5, 1], [4, 3], [4, -5], [2, 3], [1, 2], [1, 5], [3, 10]]; + $testLabels = ['b', 'b', 'b', 'b', 'a', 'a', 'a', 'a']; + + $classifier = new KNearestNeighbors(); + $classifier->train($trainSamples, $trainLabels); + $predicted = $classifier->predict($testSamples); + + $this->assertEquals($testLabels, $predicted); + } + + public function testAccuracyOnIrisDataset() + { + $dataset = new RandomSplit(new Iris(), $testSize = 0.5, $seed = 123); + $classifier = new KNearestNeighbors($k = 4); + $classifier->train($dataset->getTrainSamples(), $dataset->getTrainLabels()); + $predicted = $classifier->predict($dataset->getTestSamples()); + $score = Accuracy::score($dataset->getTestLabels(), $predicted); + + $this->assertEquals(0.96, $score); + } +} diff --git a/tests/Phpml/CrossValidation/RandomSplitTest.php b/tests/Phpml/CrossValidation/RandomSplitTest.php new file mode 100644 index 0000000..d31c6a6 --- /dev/null +++ b/tests/Phpml/CrossValidation/RandomSplitTest.php @@ -0,0 +1,94 @@ +assertEquals(2, count($randomSplit->getTestSamples())); + $this->assertEquals(2, count($randomSplit->getTrainSamples())); + + 
$randomSplit2 = new RandomSplit($dataset, 0.25); + + $this->assertEquals(1, count($randomSplit2->getTestSamples())); + $this->assertEquals(3, count($randomSplit2->getTrainSamples())); + } + + public function testDatasetRandomSplitWithSameSeed() + { + $dataset = new ArrayDataset( + $samples = [[1], [2], [3], [4], [5], [6], [7], [8]], + $labels = ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'] + ); + + $seed = 123; + + $randomSplit1 = new RandomSplit($dataset, 0.5, $seed); + $randomSplit2 = new RandomSplit($dataset, 0.5, $seed); + + $this->assertEquals($randomSplit1->getTestLabels(), $randomSplit2->getTestLabels()); + $this->assertEquals($randomSplit1->getTestSamples(), $randomSplit2->getTestSamples()); + $this->assertEquals($randomSplit1->getTrainLabels(), $randomSplit2->getTrainLabels()); + $this->assertEquals($randomSplit1->getTrainSamples(), $randomSplit2->getTrainSamples()); + } + + public function testDatasetRandomSplitWithDifferentSeed() + { + $dataset = new ArrayDataset( + $samples = [[1], [2], [3], [4], [5], [6], [7], [8]], + $labels = ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'] + ); + + $randomSplit1 = new RandomSplit($dataset, 0.5, 4321); + $randomSplit2 = new RandomSplit($dataset, 0.5, 1234); + + $this->assertNotEquals($randomSplit1->getTestLabels(), $randomSplit2->getTestLabels()); + $this->assertNotEquals($randomSplit1->getTestSamples(), $randomSplit2->getTestSamples()); + $this->assertNotEquals($randomSplit1->getTrainLabels(), $randomSplit2->getTrainLabels()); + $this->assertNotEquals($randomSplit1->getTrainSamples(), $randomSplit2->getTrainSamples()); + } + + public function testRandomSplitCorrectSampleAndLabelPosition() + { + $dataset = new ArrayDataset( + $samples = [[1], [2], [3], [4]], + $labels = [1, 2, 3, 4] + ); + + $randomSplit = new RandomSplit($dataset, 0.5); + + $this->assertEquals($randomSplit->getTestSamples()[0][0], $randomSplit->getTestLabels()[0]); + $this->assertEquals($randomSplit->getTestSamples()[1][0], $randomSplit->getTestLabels()[1]); + 
$this->assertEquals($randomSplit->getTrainSamples()[0][0], $randomSplit->getTrainLabels()[0]); + $this->assertEquals($randomSplit->getTrainSamples()[1][0], $randomSplit->getTrainLabels()[1]); + } +} diff --git a/tests/Phpml/Dataset/ArrayDatasetTest.php b/tests/Phpml/Dataset/ArrayDatasetTest.php new file mode 100644 index 0000000..7244b3e --- /dev/null +++ b/tests/Phpml/Dataset/ArrayDatasetTest.php @@ -0,0 +1,29 @@ +assertEquals($samples, $dataset->getSamples()); + $this->assertEquals($labels, $dataset->getLabels()); + } +} diff --git a/tests/Phpml/Dataset/CsvDatasetTest.php b/tests/Phpml/Dataset/CsvDatasetTest.php new file mode 100644 index 0000000..db87d62 --- /dev/null +++ b/tests/Phpml/Dataset/CsvDatasetTest.php @@ -0,0 +1,28 @@ +assertEquals(10, count($dataset->getSamples())); + $this->assertEquals(10, count($dataset->getLabels())); + } +} diff --git a/tests/Phpml/Dataset/Demo/IrisTest.php b/tests/Phpml/Dataset/Demo/IrisTest.php new file mode 100644 index 0000000..1f0da90 --- /dev/null +++ b/tests/Phpml/Dataset/Demo/IrisTest.php @@ -0,0 +1,22 @@ +assertEquals(150, count($iris->getSamples())); + $this->assertEquals(150, count($iris->getLabels())); + + // one sample features count + $this->assertEquals(4, count($iris->getSamples()[0])); + } +} diff --git a/tests/Phpml/Dataset/Resources/dataset.csv b/tests/Phpml/Dataset/Resources/dataset.csv new file mode 100644 index 0000000..fcf0121 --- /dev/null +++ b/tests/Phpml/Dataset/Resources/dataset.csv @@ -0,0 +1,11 @@ +feature1,feature2,label +1,1,a +2,1,b +3,1,c +4,5,a +2,4,a +1,5,a +2,6,b +3,7,c +4,4,a +2,0,a \ No newline at end of file diff --git a/tests/Phpml/Metric/AccuracyTest.php b/tests/Phpml/Metric/AccuracyTest.php new file mode 100644 index 0000000..aa68b22 --- /dev/null +++ b/tests/Phpml/Metric/AccuracyTest.php @@ -0,0 +1,37 @@ +assertEquals(0.5, Accuracy::score($actualLabels, $predictedLabels)); + } + + public function testCalculateNotNormalizedScore() + { + $actualLabels = ['a', 'b', 'a', 'b']; + 
$predictedLabels = ['a', 'b', 'b', 'b']; + + $this->assertEquals(3, Accuracy::score($actualLabels, $predictedLabels, false)); + } +} diff --git a/tests/Phpml/Metric/DistanceTest.php b/tests/Phpml/Metric/DistanceTest.php new file mode 100644 index 0000000..b5fdc75 --- /dev/null +++ b/tests/Phpml/Metric/DistanceTest.php @@ -0,0 +1,51 @@ +assertEquals($expectedDistance, $actualDistance); + } + + public function testCalculateEuclideanDistanceForTwoAndMoreDimension() + { + $a = [4, 6]; + $b = [2, 5]; + + $expectedDistance = 2.2360679774998; + $actualDistance = Distance::euclidean($a, $b); + + $this->assertEquals($expectedDistance, $actualDistance); + + $a = [6, 10, 3]; + $b = [2, 5, 5]; + + $expectedDistance = 6.7082039324993694; + $actualDistance = Distance::euclidean($a, $b); + + $this->assertEquals($expectedDistance, $actualDistance); + } +} diff --git a/tools/php-cs-fixer.sh b/tools/php-cs-fixer.sh new file mode 100755 index 0000000..dbf66e4 --- /dev/null +++ b/tools/php-cs-fixer.sh @@ -0,0 +1,6 @@ +#!/bin/bash +echo "Fixing src/ folder" +php-cs-fixer fix src/ --level=symfony + +echo "Fixing tests/ folder" +php-cs-fixer fix tests/ --level=symfony