Merge pull request #1 from Bimsuru/master

new changes
This commit is contained in:
Jorge Casas 2020-10-02 08:37:16 +02:00 committed by GitHub
commit e1725096d4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
423 changed files with 26909 additions and 2862 deletions

9
.editorconfig Normal file
View File

@ -0,0 +1,9 @@
root = true
[*]
end_of_line = lf
charset = utf-8
max_line_length = 80
indent_style = space
indent_size = 4
insert_final_newline = true

14
.gitattributes vendored Normal file
View File

@ -0,0 +1,14 @@
* text=auto
/docs export-ignore
/tests export-ignore
/.gitattributes export-ignore
/.gitignore export-ignore
/.travis.yml export-ignore
/ecs.yml export-ignore
/CHANGELOG.md export-ignore
/CONTRIBUTING.md export-ignore
/mkdocs.yml export-ignore
/phpbench.json export-ignore
/phpstan.neon export-ignore
/phpunit.xml export-ignore

4
.gitignore vendored
View File

@ -1 +1,5 @@
/vendor/ /vendor/
/build
/tests/Performance/Data/*.csv
.php_cs.cache
.phpunit.result.cache

View File

@ -1,5 +1,38 @@
language: php language: php
php:
- '7.0' matrix:
before_script: composer install fast_finish: true
script: bin/phpunit
include:
- os: linux
php: '7.2'
env: PHPUNIT_FLAGS="--coverage-clover build/logs/clover.xml" DISABLE_XDEBUG="true" STATIC_ANALYSIS="true"
- os: linux
php: '7.3'
- os: linux
php: '7.4'
cache:
directories:
- $HOME/.composer/cache
before_install:
- if [[ $DISABLE_XDEBUG == "true" ]]; then phpenv config-rm xdebug.ini; fi
install:
- curl -s http://getcomposer.org/installer | php
- php composer.phar install --no-interaction --ignore-platform-reqs
script:
- vendor/bin/phpunit $PHPUNIT_FLAGS
- if [[ $STATIC_ANALYSIS != "" ]]; then composer check-cs; fi
- if [[ $STATIC_ANALYSIS != "" ]]; then composer phpstan; fi
after_success:
- |
if [[ $PHPUNIT_FLAGS != "" ]]; then
wget https://github.com/php-coveralls/php-coveralls/releases/download/v2.0.0/php-coveralls.phar
php php-coveralls.phar --verbose;
fi

151
CHANGELOG.md Normal file
View File

@ -0,0 +1,151 @@
# Changelog
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [0.9.0] - Unreleased
### Added
- [Preprocessing] Implement LabelEncoder
- [Preprocessing] Implement ColumnFilter
- [Preprocessing] Implement LambdaTransformer
- [Preprocessing] Implement NumberConverter
- [Preprocessing] Implement OneHotEncoder
- [Workflow] Implement FeatureUnion
- [Metric] Add Regression metrics: meanSquaredError, meanSquaredLogarithmicError, meanAbsoluteError, medianAbsoluteError, r2Score, maxError
- [Regression] Implement DecisionTreeRegressor
## [0.8.0] - 2019-03-20
### Added
- [Tokenization] Added NGramTokenizer (#350)
- editorconfig file (#355)
### Fixed
- [Dataset] FilesDataset read samples without additional array (#363)
- [Tokenization] fixed error with numeric token values (#363)
### Changed
- [Math] improved performance with pow and sqrt replacement (#350)
- [Math] reduce duplicated code in distance metrics (#348)
- update phpunit to 7.5.1 (#335)
- code style fixes (#334)
## [0.7.0] - 2018-11-07
### Added
- [Clustering] added KMeans associative clustering (#262)
- [Dataset] added removeColumns function to ArrayDataset (#249)
- [Dataset] added a SvmDataset class for SVM-Light (or LibSVM) format files (#237)
- [Dataset] added Mnist Dataset for MNIST file format (#326)
- [Internal] Add performance test for LeastSquares (#263)
### Changed
- [Internal] implement Keep a Changelog format
- [Classification] changed the default kernel type in SVC to Kernel::RBF (#267)
- [Optimizer] removed $initialTheta property and renamed setInitialTheta method to setTheta (#252)
- [Imputer] Throw exception when trying to transform without train data (#314)
- [Math] Micro optimization for matrix multiplication (#255)
- [Internal] Throw proper exception (#259, #251)
- [MLPClassifier] return labels in output (#315)
- [Internal] Update phpstan to 0.10.5 (#320)
### Fixed
- [SVM] ensure DataTransformer::testSet samples array is not empty (#204)
- [Optimizer] optimizer initial theta randomization (#239)
- [Internal] travis build on osx (#281)
- [SVM] SVM locale (non-locale aware) (#288)
- [Internal] typo, tests, code styles and documentation fixes (#265, #261, #254, #253, #251, #250, #248, #245, #243, #317, #328)
- [Classification] Check if feature exists when predicting target in NaiveBayes (#327)
## [0.6.2] - 2018-02-22
### Fixed
- Fix Apriori array keys (#238)
## [0.6.1] - 2018-02-18
### Fixed
- Fix KMeans and EigenvalueDecomposition (#235)
## [0.6.0] - 2018-02-16
- feature [FeatureSelection] implement SelectKBest with scoring functions (#232)
- feature [FeatureSelection] implement VarianceThreshold - simple baseline approach to feature selection. (#228)
- feature [Classification] support probability estimation in SVC (#218)
- feature [NeuralNetwork] configure an Activation Function per hidden layer (#208)
- feature [NeuralNetwork] Ability to update learningRate in MLP (#160)
- feature [Metric] Choose averaging method in classification report (#205)
- enhancement Add phpstan strict rules (#233)
- enhancement Flatten directory structure (#220)
- enhancement Update phpunit/phpunit (#219)
- enhancement Cache dependencies installed with composer on Travis (#215)
- enhancement Add support for coveralls.io (#153)
- enhancement Add phpstan and easy coding standards (#156, #168)
- enhancement Throw exception when libsvm command fails to run (#200, #202)
- enhancement Normalize composer.json and sort packages (#214, #210)
- enhancement Rewrite DBSCAN (#185)
- fix phpunit include tests path (#230)
- fix support of a rule in Apriori (#229)
- fix apriori generates an empty array as a part of the frequent item sets (#224)
- fix backpropagation random error (#157)
- fix logistic regression implementation (#169)
- fix activation functions support (#163)
- fix string representation of integer labels issue in NaiveBayes (#206)
- fix the implementation of conjugate gradient method (#184)
- typo, tests and documentation fixes (#234, #221, #181, #183, #155, #159, #165, #187, #154, #191, #203, #209, #213, #212, #211)
## [0.5.0] - 2017-11-14
- general [php] Upgrade to PHP 7.1 (#150)
- general [coding standard] fix imports order and drop unused docs typehints
- feature [NeuralNetwork] Add PReLU activation function (#128)
- feature [NeuralNetwork] Add ThresholdedReLU activation function (#129)
- feature [Dataset] Support CSV with long lines (#119)
- feature [NeuralNetwork] Neural networks partial training and persistency (#91)
- feature Add French stopwords (#92)
- feature New methods: setBinPath, setVarPath in SupportVectorMachine (#73)
- feature Linear Discriminant Analysis (LDA) (#82)
- feature Linear algebra operations, Dimensionality reduction and some other minor changes (#81)
- feature Partial training base (#78)
- feature Add delimiter option for CsvDataset (#66)
- feature LogisticRegression classifier & Optimization methods (#63)
- feature Additional training for SVR (#59)
- optimization Comparison - replace eval (#130)
- optimization Use C-style casts (#124)
- optimization Speed up DataTransformer (#122)
- bug DBSCAN fix for associative keys and array_merge performance optimization (#139)
- bug Ensure user-provided SupportVectorMachine paths are valid (#126)
- bug [DecisionTree] Fix string cast #120 (#121)
- bug fix invalid typehint for subs method (#110)
- bug Fix samples transformation in Pipeline training (#94)
- bug Fix division by 0 error during normalization (#83)
- bug Fix wrong docs references (#79)
## [0.4.0] - 2017-02-23
- feature [Classification] - Ensemble Classifiers : Bagging and RandomForest by Mustafa Karabulut
- feature [Classification] - RandomForest::getFeatureImportances() method by Mustafa Karabulut
- feature [Classification] - Linear classifiers: Perceptron, Adaline, DecisionStump by Mustafa Karabulut
- feature [Classification] - AdaBoost algorithm by Mustafa Karabulut
- bug [Math] - Check if matrix is singular doing inverse by Povilas Susinskas
- optimization - Euclidean optimization by Mustafa Karabulut
## [0.3.0] - 2017-02-04
- feature [Persistency] - ModelManager - save and restore trained models by David Monllaó
- feature [Classification] - DecisionTree implementation by Mustafa Karabulut
- feature [Clustering] - Fuzzy C Means implementation by Mustafa Karabulut
- other small fixes and code styles refactors
## [0.2.1] - 2016-11-20
- feature [Association] - Apriori algorithm implementation
- bug [Metric] - division by zero
## [0.2.0] - 2016-08-14
- feature [NeuralNetwork] - MultilayerPerceptron and Backpropagation training
## [0.1.2] - 2016-07-24
- feature [Dataset] - FilesDataset - load dataset from files (folder names as targets)
- feature [Metric] - ClassificationReport - report about trained classifier
- bug [Feature Extraction] - fix problem with token count vectorizer array order
- tests [General] - add more tests for specific conditions
## [0.1.1] - 2016-07-12
- feature [Cross Validation] Stratified Random Split - equal distribution for targets in split
- feature [General] Documentation - add missing pages (Pipeline, ConfusionMatrix and TfIdfTransformer) and fix links
## [0.1.0] - 2016-07-08
- first develop release
- base tools for Machine Learning: Algorithms, Cross Validation, Preprocessing, Feature Extraction
- bug [General] #7 - PHP-ML doesn't work on Mac

66
CONTRIBUTING.md Normal file
View File

@ -0,0 +1,66 @@
# Contributing to PHP-ML
PHP-ML is an open source project. If you'd like to contribute, please read the following text. Before I can merge your
Pull-Request, here are some guidelines that you need to follow. These guidelines exist not to annoy you, but to keep the
code base clean, unified and future proof.
## Branch
You should only open pull requests against the `master` branch.
## Unit Tests
Please try to add a test for your pull-request. You can run the unit-tests by calling:
```bash
vendor/bin/phpunit
```
## Performance Tests
Before the first run, the bootstrap script will download all necessary datasets from the public repository `php-ai/php-ml-datasets`.
Time performance tests:
```bash
vendor/bin/phpbench run --report=time
```
Memory performance tests:
```bash
vendor/bin/phpbench run --report=memory
```
## Travis
GitHub automatically runs your pull request through Travis CI.
If you break the tests, I cannot merge your code, so please make sure that your code is working before opening up a Pull-Request.
## Merge
Please give me time to review your pull requests. I will do my best to review everything as fast as possible, but cannot always live up to my own expectations.
## Coding Standards & Static Analysis
When contributing code to PHP-ML, you must follow its coding standards. To do that, just run:
```bash
composer fix-cs
```
[More about EasyCodingStandard](https://github.com/Symplify/EasyCodingStandard)
Code has to also pass static analysis by [PHPStan](https://github.com/phpstan/phpstan):
```bash
composer phpstan
```
## Documentation
Please update the documentation pages if necessary. You can find them in docs/.
---
Thank you very much again for your contribution!

View File

@ -1,6 +1,7 @@
The MIT License (MIT) The MIT License (MIT)
Copyright (c) 2016 Arkadiusz Kondas <arkadiusz.kondas[at]gmail> Copyright (c) 2016-2019 Arkadiusz Kondas <arkadiusz.kondas[at]gmail>
Copyright (c) 2018 Andrew DalPino
Permission is hereby granted, free of charge, to any person obtaining a copy Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal of this software and associated documentation files (the "Software"), to deal

117
README.md
View File

@ -1,15 +1,26 @@
# PHP Machine Learning library # PHP-ML - Machine Learning library for PHP
[![Build Status](https://scrutinizer-ci.com/g/php-ai/php-ml/badges/build.png?b=develop)](https://scrutinizer-ci.com/g/php-ai/php-ml/build-status/develop) [![Minimum PHP Version](https://img.shields.io/badge/php-%3E%3D%207.2-8892BF.svg)](https://php.net/)
[![Documentation Status](https://readthedocs.org/projects/php-ml/badge/?version=develop)](http://php-ml.readthedocs.org/en/develop/?badge=develop) [![Latest Stable Version](https://img.shields.io/packagist/v/php-ai/php-ml.svg)](https://packagist.org/packages/php-ai/php-ml)
[![Build Status](https://travis-ci.org/php-ai/php-ml.svg?branch=master)](https://travis-ci.org/php-ai/php-ml)
[![Documentation Status](https://readthedocs.org/projects/php-ml/badge/?version=master)](http://php-ml.readthedocs.org/)
[![Total Downloads](https://poser.pugx.org/php-ai/php-ml/downloads.svg)](https://packagist.org/packages/php-ai/php-ml) [![Total Downloads](https://poser.pugx.org/php-ai/php-ml/downloads.svg)](https://packagist.org/packages/php-ai/php-ml)
[![License](https://poser.pugx.org/php-ai/php-ml/license.svg)](https://packagist.org/packages/php-ai/php-ml) [![License](https://poser.pugx.org/php-ai/php-ml/license.svg)](https://packagist.org/packages/php-ai/php-ml)
[![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/php-ai/php-ml/badges/quality-score.png?b=develop)](https://scrutinizer-ci.com/g/php-ai/php-ml/?branch=develop) [![Coverage Status](https://coveralls.io/repos/github/php-ai/php-ml/badge.svg?branch=master)](https://coveralls.io/github/php-ai/php-ml?branch=master)
[![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/php-ai/php-ml/badges/quality-score.png?b=master)](https://scrutinizer-ci.com/g/php-ai/php-ml/?branch=master)
Fresh approach to Machine Learning in PHP. Note that at the moment PHP is not the best choice for machine learning but maybe this will change ... <p align="center">
<img src="https://github.com/php-ai/php-ml/raw/master/docs/assets/php-ml-logo.png" />
</p>
Fresh approach to Machine Learning in PHP. Algorithms, Cross Validation, Neural Network, Preprocessing, Feature Extraction and much more in one library.
PHP-ML requires PHP >= 7.2.
Simple example of classification: Simple example of classification:
```php ```php
require_once __DIR__ . '/vendor/autoload.php';
use Phpml\Classification\KNearestNeighbors; use Phpml\Classification\KNearestNeighbors;
$samples = [[1, 3], [1, 4], [2, 4], [3, 1], [4, 1], [4, 2]]; $samples = [[1, 3], [1, 4], [2, 4], [3, 1], [4, 1], [4, 2]];
@ -18,53 +29,121 @@ $labels = ['a', 'a', 'a', 'b', 'b', 'b'];
$classifier = new KNearestNeighbors(); $classifier = new KNearestNeighbors();
$classifier->train($samples, $labels); $classifier->train($samples, $labels);
$classifier->predict([3, 2]); echo $classifier->predict([3, 2]);
// return 'b' // return 'b'
``` ```
## Awards
<a href="http://www.yegor256.com/2016/10/23/award-2017.html">
<img src="http://www.yegor256.com/images/award/2017/winner-itcraftsmanpl.png" width="400"/></a>
## Documentation ## Documentation
To find out how to use PHP-ML follow [Documentation](http://php-ml.readthedocs.org/). To find out how to use PHP-ML follow [Documentation](http://php-ml.readthedocs.org/).
## Installation ## Installation
Currently this library is in the process of developing, but You can install it with Composer: Currently this library is in the process of being developed, but You can install it with Composer:
``` ```
composer require php-ai/php-ml composer require php-ai/php-ml
``` ```
## Examples
Example scripts are available in a separate repository [php-ai/php-ml-examples](https://github.com/php-ai/php-ml-examples).
## Datasets
Public datasets are available in a separate repository [php-ai/php-ml-datasets](https://github.com/php-ai/php-ml-datasets).
## Features ## Features
* Association rule learning
* [Apriori](http://php-ml.readthedocs.io/en/latest/machine-learning/association/apriori/)
* Classification * Classification
* [SVC](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/svc/)
* [k-Nearest Neighbors](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/k-nearest-neighbors/) * [k-Nearest Neighbors](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/k-nearest-neighbors/)
* [Naive Bayes](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/naive-bayes/) * [Naive Bayes](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/naive-bayes/)
* Decision Tree (CART)
* Ensemble Algorithms
* Bagging (Bootstrap Aggregating)
* Random Forest
* AdaBoost
* Linear
* Adaline
* Decision Stump
* Perceptron
* LogisticRegression
* Regression * Regression
* [Least Squares](http://php-ml.readthedocs.io/en/latest/machine-learning/regression/least-squares/) * [Least Squares](http://php-ml.readthedocs.io/en/latest/machine-learning/regression/least-squares/)
* [SVR](http://php-ml.readthedocs.io/en/latest/machine-learning/regression/svr/)
* DecisionTreeRegressor
* Clustering * Clustering
* [k-Means](http://php-ml.readthedocs.io/en/latest/machine-learning/clustering/k-means) * [k-Means](http://php-ml.readthedocs.io/en/latest/machine-learning/clustering/k-means/)
* [DBSCAN](http://php-ml.readthedocs.io/en/latest/machine-learning/clustering/dbscan) * [DBSCAN](http://php-ml.readthedocs.io/en/latest/machine-learning/clustering/dbscan/)
* Fuzzy C-Means
* Metric
* [Accuracy](http://php-ml.readthedocs.io/en/latest/machine-learning/metric/accuracy/)
* [Confusion Matrix](http://php-ml.readthedocs.io/en/latest/machine-learning/metric/confusion-matrix/)
* [Classification Report](http://php-ml.readthedocs.io/en/latest/machine-learning/metric/classification-report/)
* Regression
* Workflow
* [Pipeline](http://php-ml.readthedocs.io/en/latest/machine-learning/workflow/pipeline/)
* FeatureUnion
* Neural Network
* [Multilayer Perceptron Classifier](http://php-ml.readthedocs.io/en/latest/machine-learning/neural-network/multilayer-perceptron-classifier/)
* Cross Validation * Cross Validation
* [Random Split](http://php-ml.readthedocs.io/en/latest/machine-learning/cross-validation/random-split) * [Random Split](http://php-ml.readthedocs.io/en/latest/machine-learning/cross-validation/random-split/)
* [Stratified Random Split](http://php-ml.readthedocs.io/en/latest/machine-learning/cross-validation/stratified-random-split/)
* Feature Selection
* [Variance Threshold](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-selection/variance-threshold/)
* [SelectKBest](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-selection/selectkbest/)
* Preprocessing
* [Normalization](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/normalization/)
* [Imputation missing values](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/imputation-missing-values/)
* LabelEncoder
* LambdaTransformer
* NumberConverter
* ColumnFilter
* OneHotEncoder
* Feature Extraction
* [Token Count Vectorizer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/token-count-vectorizer/)
* NGramTokenizer
* WhitespaceTokenizer
* WordTokenizer
* [Tf-idf Transformer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/tf-idf-transformer/)
* Dimensionality Reduction
* PCA (Principal Component Analysis)
* Kernel PCA
* LDA (Linear Discriminant Analysis)
* Datasets * Datasets
* [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset) * [Array](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/array-dataset/)
* [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset/)
* [Files](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/files-dataset/)
* [SVM](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/svm-dataset/)
* [MNIST](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/mnist-dataset/)
* Ready to use: * Ready to use:
* [Iris](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/iris/) * [Iris](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/iris/)
* [Wine](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/wine/)
* [Glass](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/glass/)
* Models management
* [Persistency](http://php-ml.readthedocs.io/en/latest/machine-learning/model-manager/persistency/)
* Math * Math
* [Distance](http://php-ml.readthedocs.io/en/latest/math/distance/) * [Distance](http://php-ml.readthedocs.io/en/latest/math/distance/)
* [Matrix](http://php-ml.readthedocs.io/en/latest/math/matrix/) * [Matrix](http://php-ml.readthedocs.io/en/latest/math/matrix/)
* [Set](http://php-ml.readthedocs.io/en/latest/math/set/)
* [Statistic](http://php-ml.readthedocs.io/en/latest/math/statistic/)
* Linear Algebra
## Contribute ## Contribute
- Issue Tracker: github.com/php-ai/php-ml/issues - [Guide: CONTRIBUTING.md](https://github.com/php-ai/php-ml/blob/master/CONTRIBUTING.md)
- Source Code: github.com/php-ai/php-ml - [Issue Tracker: github.com/php-ai/php-ml](https://github.com/php-ai/php-ml/issues)
- [Source Code: github.com/php-ai/php-ml](https://github.com/php-ai/php-ml)
After installation, you can launch the test suite in project root directory (you will need to install dev requirements with Composer) You can find more about contributing in [CONTRIBUTING.md](CONTRIBUTING.md).
```
bin/phpunit
```
## License ## License

4
bin/code-coverage.sh Executable file
View File

@ -0,0 +1,4 @@
#!/bin/bash
echo "Run PHPUnit with code coverage"
bin/phpunit --coverage-html .coverage
google-chrome .coverage/index.html

BIN
bin/libsvm/svm-predict Executable file

Binary file not shown.

BIN
bin/libsvm/svm-predict-osx Executable file

Binary file not shown.

BIN
bin/libsvm/svm-predict.exe Normal file

Binary file not shown.

BIN
bin/libsvm/svm-scale Executable file

Binary file not shown.

BIN
bin/libsvm/svm-scale-osx Executable file

Binary file not shown.

BIN
bin/libsvm/svm-scale.exe Normal file

Binary file not shown.

BIN
bin/libsvm/svm-train Executable file

Binary file not shown.

BIN
bin/libsvm/svm-train-osx Executable file

Binary file not shown.

BIN
bin/libsvm/svm-train.exe Normal file

Binary file not shown.

View File

@ -1,28 +1,52 @@
{ {
"name": "php-ai/php-ml", "name": "php-ai/php-ml",
"type": "library", "type": "library",
"description": "PHP Machine Learning library", "description": "PHP-ML - Machine Learning library for PHP",
"license": "MIT", "keywords": [
"keywords": ["machine learning","pattern recognition","computational learning theory","artificial intelligence"], "machine learning",
"pattern recognition",
"neural network",
"computational learning theory",
"artificial intelligence",
"data science",
"feature extraction"
],
"homepage": "https://github.com/php-ai/php-ml", "homepage": "https://github.com/php-ai/php-ml",
"license": "MIT",
"authors": [ "authors": [
{ {
"name": "Arkadiusz Kondas", "name": "Arkadiusz Kondas",
"email": "arkadiusz.kondas@gmail.com" "email": "arkadiusz.kondas@gmail.com"
} }
], ],
"autoload": {
"psr-0": {
"Phpml": "src/"
}
},
"config": {
"bin-dir": "bin"
},
"require": { "require": {
"php": ">=7.0.0" "php": "^7.2"
}, },
"require-dev": { "require-dev": {
"phpunit/phpunit": "^5.2" "phpbench/phpbench": "^0.16.0",
"phpstan/phpstan-phpunit": "^0.12",
"phpstan/phpstan": "^0.12",
"phpstan/phpstan-strict-rules": "^0.12",
"phpunit/phpunit": "^8.0",
"symplify/easy-coding-standard": "^6.0"
},
"config": {
"preferred-install": "dist",
"sort-packages": true
},
"autoload": {
"psr-4": {
"Phpml\\": "src/"
}
},
"autoload-dev": {
"psr-4": {
"Phpml\\Tests\\": "tests/"
}
},
"scripts": {
"check-cs": "vendor/bin/ecs check src tests bin",
"fix-cs": "vendor/bin/ecs check src tests bin --fix",
"phpstan": "vendor/bin/phpstan.phar analyse src tests bin --level max --configuration phpstan.neon"
} }
} }

4332
composer.lock generated

File diff suppressed because it is too large Load Diff

BIN
docs/assets/php-ml-logo.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.2 KiB

View File

@ -1,15 +1,26 @@
# PHP Machine Learning library # PHP-ML - Machine Learning library for PHP
[![Build Status](https://scrutinizer-ci.com/g/php-ai/php-ml/badges/build.png?b=develop)](https://scrutinizer-ci.com/g/php-ai/php-ml/build-status/develop) [![Minimum PHP Version](https://img.shields.io/badge/php-%3E%3D%207.2-8892BF.svg)](https://php.net/)
[![Documentation Status](https://readthedocs.org/projects/php-ml/badge/?version=develop)](http://php-ml.readthedocs.org/en/develop/?badge=develop) [![Latest Stable Version](https://img.shields.io/packagist/v/php-ai/php-ml.svg)](https://packagist.org/packages/php-ai/php-ml)
[![Build Status](https://travis-ci.org/php-ai/php-ml.svg?branch=master)](https://travis-ci.org/php-ai/php-ml)
[![Documentation Status](https://readthedocs.org/projects/php-ml/badge/?version=master)](http://php-ml.readthedocs.org/)
[![Total Downloads](https://poser.pugx.org/php-ai/php-ml/downloads.svg)](https://packagist.org/packages/php-ai/php-ml) [![Total Downloads](https://poser.pugx.org/php-ai/php-ml/downloads.svg)](https://packagist.org/packages/php-ai/php-ml)
[![License](https://poser.pugx.org/php-ai/php-ml/license.svg)](https://packagist.org/packages/php-ai/php-ml) [![License](https://poser.pugx.org/php-ai/php-ml/license.svg)](https://packagist.org/packages/php-ai/php-ml)
[![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/php-ai/php-ml/badges/quality-score.png?b=develop)](https://scrutinizer-ci.com/g/php-ai/php-ml/?branch=develop) [![Coverage Status](https://coveralls.io/repos/github/php-ai/php-ml/badge.svg?branch=master)](https://coveralls.io/github/php-ai/php-ml?branch=master)
[![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/php-ai/php-ml/badges/quality-score.png?b=master)](https://scrutinizer-ci.com/g/php-ai/php-ml/?branch=master)
Fresh approach to Machine Learning in PHP. Note that at the moment PHP is not the best choice for machine learning but maybe this will change ... <p align="center">
<img src="https://github.com/php-ai/php-ml/raw/master/docs/assets/php-ml-logo.png" />
</p>
Fresh approach to Machine Learning in PHP. Algorithms, Cross Validation, Neural Network, Preprocessing, Feature Extraction and much more in one library.
PHP-ML requires PHP >= 7.2.
Simple example of classification: Simple example of classification:
```php ```php
require_once __DIR__ . '/vendor/autoload.php';
use Phpml\Classification\KNearestNeighbors; use Phpml\Classification\KNearestNeighbors;
$samples = [[1, 3], [1, 4], [2, 4], [3, 1], [4, 1], [4, 2]]; $samples = [[1, 3], [1, 4], [2, 4], [3, 1], [4, 1], [4, 2]];
@ -22,54 +33,80 @@ $classifier->predict([3, 2]);
// return 'b' // return 'b'
``` ```
## Awards
<a href="http://www.yegor256.com/2016/10/23/award-2017.html">
<img src="http://www.yegor256.com/images/award/2017/winner-itcraftsmanpl.png" width="400"/></a>
## Documentation ## Documentation
To find out how to use PHP-ML follow [Documentation](http://php-ml.readthedocs.org/). To find out how to use PHP-ML follow [Documentation](http://php-ml.readthedocs.org/).
## Installation ## Installation
Currently this library is in the process of developing, but You can install it with Composer: This library is still in beta. However, it can be installed with Composer:
``` ```
composer require php-ai/php-ml composer require php-ai/php-ml
``` ```
## Examples
Example scripts are available in a separate repository [php-ai/php-ml-examples](https://github.com/php-ai/php-ml-examples).
## Features ## Features
* Association rule Learning
* [Apriori](machine-learning/association/apriori.md)
* Classification * Classification
* [k-Nearest Neighbors](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/k-nearest-neighbors/) * [SVC](machine-learning/classification/svc.md)
* [Naive Bayes](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/naive-bayes/) * [k-Nearest Neighbors](machine-learning/classification/k-nearest-neighbors.md)
* [Naive Bayes](machine-learning/classification/naive-bayes.md)
* Regression * Regression
* [Least Squares](http://php-ml.readthedocs.io/en/latest/machine-learning/regression/least-squares/) * [Least Squares](machine-learning/regression/least-squares.md)
* [SVR](machine-learning/regression/svr.md)
* Clustering * Clustering
* [k-Means](http://php-ml.readthedocs.io/en/latest/machine-learning/clustering/k-means) * [k-Means](machine-learning/clustering/k-means.md)
* [DBSCAN](http://php-ml.readthedocs.io/en/latest/machine-learning/clustering/dbscan) * [DBSCAN](machine-learning/clustering/dbscan.md)
* Metric
* [Accuracy](machine-learning/metric/accuracy.md)
* [Confusion Matrix](machine-learning/metric/confusion-matrix.md)
* [Classification Report](machine-learning/metric/classification-report.md)
* Workflow
* [Pipeline](machine-learning/workflow/pipeline.md)
* Neural Network
* [Multilayer Perceptron Classifier](machine-learning/neural-network/multilayer-perceptron-classifier.md)
* Cross Validation * Cross Validation
* [Random Split](http://php-ml.readthedocs.io/en/latest/machine-learning/cross-validation/random-split) * [Random Split](machine-learning/cross-validation/random-split.md)
* [Stratified Random Split](machine-learning/cross-validation/stratified-random-split.md)
* Feature Selection
* [Variance Threshold](machine-learning/feature-selection/variance-threshold.md)
* [SelectKBest](machine-learning/feature-selection/selectkbest.md)
* Preprocessing
* [Normalization](machine-learning/preprocessing/normalization.md)
* [Imputation missing values](machine-learning/preprocessing/imputation-missing-values.md)
* LabelEncoder
* Feature Extraction
* [Token Count Vectorizer](machine-learning/feature-extraction/token-count-vectorizer.md)
* [Tf-idf Transformer](machine-learning/feature-extraction/tf-idf-transformer.md)
* Datasets * Datasets
* [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset) * [Array](machine-learning/datasets/array-dataset.md)
* [CSV](machine-learning/datasets/csv-dataset.md)
* [Files](machine-learning/datasets/files-dataset.md)
* [SVM](machine-learning/datasets/svm-dataset.md)
* [MNIST](machine-learning/datasets/mnist-dataset.md)
* Ready to use: * Ready to use:
* [Iris](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/iris/) * [Iris](machine-learning/datasets/demo/iris.md)
* [Wine](machine-learning/datasets/demo/wine.md)
* [Glass](machine-learning/datasets/demo/glass.md)
* Models management
* [Persistency](machine-learning/model-manager/persistency.md)
* Math * Math
* [Distance](http://php-ml.readthedocs.io/en/latest/math/distance/) * [Distance](math/distance.md)
* [Matrix](http://php-ml.readthedocs.io/en/latest/math/matrix/) * [Matrix](math/matrix.md)
* [Set](math/set.md)
* [Statistic](math/statistic.md)
## Contribute ##
- Issue Tracker: github.com/php-ai/php-ml/issues
- Source Code: github.com/php-ai/php-ml
After installation, you can launch the test suite in project root directory (you will need to install dev requirements with Composer)
```
bin/phpunit
```
## License
PHP-ML is released under the MIT Licence. See the bundled LICENSE file for details.
## Author
Arkadiusz Kondas (@ArkadiuszKondas) Arkadiusz Kondas (@ArkadiuszKondas)

View File

@ -0,0 +1,60 @@
# Apriori Associator
Association rule learning based on [Apriori algorithm](https://en.wikipedia.org/wiki/Apriori_algorithm) for frequent item set mining.
### Constructor Parameters
* $support - minimum threshold of [support](https://en.wikipedia.org/wiki/Association_rule_learning#Support), i.e. the ratio of samples which contain both X and Y for a rule "if X then Y"
* $confidence - minimum threshold of [confidence](https://en.wikipedia.org/wiki/Association_rule_learning#Confidence), i.e. the ratio of samples containing both X and Y to those containing X
```
use Phpml\Association\Apriori;
$associator = new Apriori($support = 0.5, $confidence = 0.5);
```
### Train
To train an associator, simply provide train samples and labels (as `array`). Example:
```
$samples = [['alpha', 'beta', 'epsilon'], ['alpha', 'beta', 'theta'], ['alpha', 'beta', 'epsilon'], ['alpha', 'beta', 'theta']];
$labels = [];
use Phpml\Association\Apriori;
$associator = new Apriori($support = 0.5, $confidence = 0.5);
$associator->train($samples, $labels);
```
You can train the associator using multiple data sets, predictions will be based on all the training data.
### Predict
To predict sample label use the `predict` method. You can provide one sample or array of samples:
```
$associator->predict(['alpha','theta']);
// return [['beta']]
$associator->predict([['alpha','epsilon'],['beta','theta']]);
// return [[['beta']], [['alpha']]]
```
### Associating
To get the generated association rules, simply use the `getRules` method.
```
$associator->getRules();
// return [['antecedent' => ['alpha', 'theta'], 'consequent' => ['beta'], 'support' => 1.0, 'confidence' => 1.0], ... ]
```
### Frequent item sets
To generate k-length frequent item sets, simply use the `apriori` method.
```
$associator->apriori();
// return [ 1 => [['alpha'], ['beta'], ['theta'], ['epsilon']], 2 => [...], ...]
```

View File

@ -2,19 +2,19 @@
Classifier implementing the k-nearest neighbors algorithm. Classifier implementing the k-nearest neighbors algorithm.
### Constructor Parameters ## Constructor Parameters
* $k - number of nearest neighbors to scan (default: 3) * $k - number of nearest neighbors to scan (default: 3)
* $distanceMetric - Distance object, default Euclidean (see [distance documentation](math/distance/)) * $distanceMetric - Distance object, default Euclidean (see [distance documentation](../../math/distance.md))
``` ```
$classifier = new KNearestNeighbors($k=4); $classifier = new KNearestNeighbors($k=4);
$classifier = new KNearestNeighbors($k=3, new Minkowski($lambda=4)); $classifier = new KNearestNeighbors($k=3, new Minkowski($lambda=4));
``` ```
### Train ## Train
To train a classifier simply provide train samples and labels (as `array`). Example: To train a classifier, simply provide train samples and labels (as `array`). Example:
``` ```
$samples = [[1, 3], [1, 4], [2, 4], [3, 1], [4, 1], [4, 2]]; $samples = [[1, 3], [1, 4], [2, 4], [3, 1], [4, 1], [4, 2]];
@ -24,9 +24,11 @@ $classifier = new KNearestNeighbors();
$classifier->train($samples, $labels); $classifier->train($samples, $labels);
``` ```
### Predict You can train the classifier using multiple data sets, predictions will be based on all the training data.
To predict sample label use `predict` method. You can provide one sample or array of samples: ## Predict
To predict sample label use the `predict` method. You can provide one sample or array of samples:
``` ```
$classifier->predict([3, 2]); $classifier->predict([3, 2]);

View File

@ -4,7 +4,7 @@ Classifier based on applying Bayes' theorem with strong (naive) independence ass
### Train ### Train
To train a classifier simply provide train samples and labels (as `array`). Example: To train a classifier, simply provide train samples and labels (as `array`). Example:
``` ```
$samples = [[5, 1, 1], [1, 5, 1], [1, 1, 5]]; $samples = [[5, 1, 1], [1, 5, 1], [1, 1, 5]];
@ -14,14 +14,16 @@ $classifier = new NaiveBayes();
$classifier->train($samples, $labels); $classifier->train($samples, $labels);
``` ```
You can train the classifier using multiple data sets, predictions will be based on all the training data.
### Predict ### Predict
To predict sample label use `predict` method. You can provide one sample or array of samples: To predict sample label use the `predict` method. You can provide one sample or array of samples:
``` ```
$classifier->predict([3, 1, 1]); $classifier->predict([3, 1, 1]);
// return 'a' // return 'a'
$classifier->predict([[3, 1, 1], [1, 4, 1]); $classifier->predict([[3, 1, 1], [1, 4, 1]]);
// return ['a', 'b'] // return ['a', 'b']
``` ```

View File

@ -0,0 +1,88 @@
# Support Vector Classification
Classifier implementing Support Vector Machine based on libsvm.
### Constructor Parameters
* $kernel (int) - kernel type to be used in the algorithm (default Kernel::RBF)
* $cost (float) - parameter C of C-SVC (default 1.0)
* $degree (int) - degree of the Kernel::POLYNOMIAL function (default 3)
* $gamma (float) - kernel coefficient for Kernel::RBF, Kernel::POLYNOMIAL and Kernel::SIGMOID. If gamma is null then 1/features will be used instead.
* $coef0 (float) - independent term in kernel function. It is only significant in Kernel::POLYNOMIAL and Kernel::SIGMOID (default 0.0)
* $tolerance (float) - tolerance of termination criterion (default 0.001)
* $cacheSize (int) - cache memory size in MB (default 100)
* $shrinking (bool) - whether to use the shrinking heuristics (default true)
* $probabilityEstimates (bool) - whether to enable probability estimates (default false)
```
$classifier = new SVC(Kernel::LINEAR, $cost = 1000);
$classifier = new SVC(Kernel::RBF, $cost = 1000, $degree = 3, $gamma = 6);
```
### Train
To train a classifier, simply provide train samples and labels (as `array`). Example:
```
use Phpml\Classification\SVC;
use Phpml\SupportVectorMachine\Kernel;
$samples = [[1, 3], [1, 4], [2, 4], [3, 1], [4, 1], [4, 2]];
$labels = ['a', 'a', 'a', 'b', 'b', 'b'];
$classifier = new SVC(Kernel::LINEAR, $cost = 1000);
$classifier->train($samples, $labels);
```
You can train the classifier using multiple data sets, predictions will be based on all the training data.
### Predict
To predict sample label use the `predict` method. You can provide one sample or array of samples:
```
$classifier->predict([3, 2]);
// return 'b'
$classifier->predict([[3, 2], [1, 5]]);
// return ['b', 'a']
```
### Probability estimation
To predict probabilities you must build a classifier with `$probabilityEstimates` set to true. Example:
```
use Phpml\Classification\SVC;
use Phpml\SupportVectorMachine\Kernel;
$samples = [[1, 3], [1, 4], [2, 4], [3, 1], [4, 1], [4, 2]];
$labels = ['a', 'a', 'a', 'b', 'b', 'b'];
$classifier = new SVC(
Kernel::LINEAR, // $kernel
1.0, // $cost
3, // $degree
null, // $gamma
0.0, // $coef0
0.001, // $tolerance
100, // $cacheSize
true, // $shrinking
true // $probabilityEstimates, set to true
);
$classifier->train($samples, $labels);
```
Then use the `predictProbability` method instead of `predict`:
```
$classifier->predictProbability([3, 2]);
// return ['a' => 0.349833, 'b' => 0.650167]
$classifier->predictProbability([[3, 2], [1, 5]]);
// return [
// ['a' => 0.349833, 'b' => 0.650167],
// ['a' => 0.922664, 'b' => 0.0773364],
// ]
```

View File

@ -7,7 +7,7 @@ It is a density-based clustering algorithm: given a set of points in some space,
* $epsilon - epsilon, maximum distance between two samples for them to be considered as in the same neighborhood * $epsilon - epsilon, maximum distance between two samples for them to be considered as in the same neighborhood
* $minSamples - number of samples in a neighborhood for a point to be considered as a core point (this includes the point itself) * $minSamples - number of samples in a neighborhood for a point to be considered as a core point (this includes the point itself)
* $distanceMetric - Distance object, default Euclidean (see [distance documentation](math/distance/)) * $distanceMetric - Distance object, default Euclidean (see [distance documentation](../../math/distance.md))
``` ```
$dbscan = new DBSCAN($epsilon = 2, $minSamples = 3); $dbscan = new DBSCAN($epsilon = 2, $minSamples = 3);
@ -16,7 +16,7 @@ $dbscan = new DBSCAN($epsilon = 2, $minSamples = 3, new Minkowski($lambda=4));
### Clustering ### Clustering
To divide the samples into clusters simply use `cluster` method. It's return the `array` of clusters with samples inside. To divide the samples into clusters, simply use the `cluster` method. It returns the `array` of clusters with samples inside.
``` ```
$samples = [[1, 1], [8, 7], [1, 2], [7, 8], [2, 1], [8, 9]]; $samples = [[1, 1], [8, 7], [1, 2], [7, 8], [2, 1], [8, 9]];

View File

@ -15,14 +15,16 @@ $kmeans = new KMeans(4, KMeans::INIT_RANDOM);
### Clustering ### Clustering
To divide the samples into clusters simply use `cluster` method. It's return the `array` of clusters with samples inside. To divide the samples into clusters, simply use the `cluster` method. It returns the `array` of clusters with samples inside.
``` ```
$samples = [[1, 1], [8, 7], [1, 2], [7, 8], [2, 1], [8, 9]]; $samples = [[1, 1], [8, 7], [1, 2], [7, 8], [2, 1], [8, 9]];
Or if you need to keep your identifiers along with yours samples you can use array keys as labels.
$samples = [ 'Label1' => [1, 1], 'Label2' => [8, 7], 'Label3' => [1, 2]];
$kmeans = new KMeans(2); $kmeans = new KMeans(2);
$kmeans->cluster($samples); $kmeans->cluster($samples);
// return [0=>[[1, 1], ...], 1=>[[8, 7], ...]] // return [0=>[[1, 1], ...], 1=>[[8, 7], ...]] or [0=>['Label1' => [1, 1], 'Label3' => [1, 2], ...], 1=>['Label2' => [8, 7], ...]]
``` ```
### Initialization methods ### Initialization methods
@ -30,8 +32,8 @@ $kmeans->cluster($samples);
#### kmeans++ (default) #### kmeans++ (default)
K-means++ method selects initial cluster centers for k-mean clustering in a smart way to speed up convergence. K-means++ method selects initial cluster centers for k-mean clustering in a smart way to speed up convergence.
It use the DASV seeding method consists of finding good initial centroids for the clusters. It uses the DASV seeding method consists of finding good initial centroids for the clusters.
#### random #### random
Random initialization method chooses completely random centroid. It get the space boundaries to avoid placing clusters centroid too far from samples data. Random initialization method chooses completely random centroid. It gets the space boundaries to avoid placing cluster centroids too far from samples data.

View File

@ -1,12 +1,12 @@
# Random Split # Random Split
One of the simplest methods from Cross-validation is implemented as `RandomSpilt` class. Samples are split to two groups: train group and test group. You can adjust number of samples in each group. One of the simplest methods from Cross-validation is implemented as `RandomSpilt` class. Samples are split to two groups: train group and test group. You can adjust the number of samples in each group.
### Constructor Parameters ### Constructor Parameters
* $dataset - object that implements `Dataset` interface * $dataset - object that implements `Dataset` interface
* $testSize - a fraction of test split (float, from 0 to 1, default: 0.3) * $testSize - a fraction of test split (float, from 0 to 1, default: 0.3)
* $seed - seed for random generator (for tests) * $seed - seed for random generator (e.g. for tests)
``` ```
$randomSplit = new RandomSplit($dataset, 0.2); $randomSplit = new RandomSplit($dataset, 0.2);
@ -14,7 +14,7 @@ $randomSplit = new RandomSplit($dataset, 0.2);
### Samples and labels groups ### Samples and labels groups
To get samples or labels from test and train group you can use getters: To get samples or labels from test and train group, you can use getters:
``` ```
$dataset = new RandomSplit($dataset, 0.3, 1234); $dataset = new RandomSplit($dataset, 0.3, 1234);

View File

@ -0,0 +1,44 @@
# Stratified Random Split
Analogously to the `RandomSplit` class, samples are split into two groups: a train group and a test group.
The distribution of samples takes their targets into account and tries to divide them equally.
You can adjust the number of samples in each group.
### Constructor Parameters
* $dataset - object that implements `Dataset` interface
* $testSize - a fraction of test split (float, from 0 to 1, default: 0.3)
* $seed - seed for random generator (e.g. for tests)
```
$split = new StratifiedRandomSplit($dataset, 0.2);
```
### Samples and labels groups
To get samples or labels from test and train group, you can use getters:
```
$dataset = new StratifiedRandomSplit($dataset, 0.3, 1234);
// train group
$dataset->getTrainSamples();
$dataset->getTrainLabels();
// test group
$dataset->getTestSamples();
$dataset->getTestLabels();
```
### Example
```
$dataset = new ArrayDataset(
$samples = [[1], [2], [3], [4], [5], [6], [7], [8]],
$targets = ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b']
);
$split = new StratifiedRandomSplit($dataset, 0.5);
```
Split will have equal amounts of each target. Two of the target `a` and two of `b`.

View File

@ -2,20 +2,40 @@
Helper class that holds data as PHP `array` type. Implements the `Dataset` interface which is used heavily in other classes. Helper class that holds data as PHP `array` type. Implements the `Dataset` interface which is used heavily in other classes.
### Constructors Parameters ### Constructor Parameters
* $samples - (array) of samples * $samples - (array) of samples
* $labels - (array) of labels * $labels - (array) of labels
``` ```
use Phpml\Dataset\ArrayDataset;
$dataset = new ArrayDataset([[1, 1], [2, 1], [3, 2], [4, 1]], ['a', 'a', 'b', 'b']); $dataset = new ArrayDataset([[1, 1], [2, 1], [3, 2], [4, 1]], ['a', 'a', 'b', 'b']);
``` ```
### Samples and labels ### Samples and labels
To get samples or labels you can use getters: To get samples or labels, you can use getters:
``` ```
$dataset->getSamples(); $dataset->getSamples();
$dataset->getLabels(); $dataset->getTargets();
```
### Remove columns
You can remove columns by their index numbers, for example:
```
use Phpml\Dataset\ArrayDataset;
$dataset = new ArrayDataset(
[[1,2,3,4], [2,3,4,5], [3,4,5,6], [4,5,6,7]],
['a', 'a', 'b', 'b']
);
$dataset->removeColumns([0,2]);
// now from each sample column 0 and 2 are removed
// [[2,4], [3,5], [4,6], [5,7]]
``` ```

View File

@ -2,14 +2,14 @@
Helper class that loads data from CSV file. It extends the `ArrayDataset`. Helper class that loads data from CSV file. It extends the `ArrayDataset`.
### Constructors Parameters ### Constructor Parameters
* $filepath - (string) path to `.csv` file * $filepath - (string) path to `.csv` file
* $features - (int) number of columns that are features (starts from first column), last column must be a label * $features - (int) number of columns that are features (starts from first column), last column must be a label
* $headingRow - (bool) define is file have a heading row (if `true` then first row will be ignored) * $headingRow - (bool) define if the file has a heading row (if `true` then first row will be ignored)
``` ```
$dataset = new CsvDataset('dataset.csv', 2, true); $dataset = new CsvDataset('dataset.csv', 2, true);
``` ```
See [ArrayDataset](machine-learning/datasets/array-dataset/) for more information. See [ArrayDataset](array-dataset.md) for more information.

View File

@ -0,0 +1,42 @@
# Glass Dataset
From USA Forensic Science Service; 6 types of glass; defined in terms of their oxide content (e.g. Na, Fe, K, etc.)
### Specification
| Classes | 6 |
| Samples total | 214 |
| Features per sample | 9 |
Samples per class:
* 70 float processed building windows
* 17 float processed vehicle windows
* 76 non-float processed building windows
* 13 containers
* 9 tableware
* 29 headlamps
### Load
To load the Glass dataset, simply use:
```
use Phpml\Dataset\Demo\GlassDataset;
$dataset = new GlassDataset();
```
### Several samples example
```
RI: refractive index,Na: Sodium,Mg: Magnesium,Al: Aluminum,Si: Silicon,K: Potassium,Ca: Calcium,Ba: Barium,Fe: Iron,type of glass
1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.00,building_windows_float_processed
1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.00,building_windows_float_processed
1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.00,building_windows_float_processed
1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.00,building_windows_float_processed
1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.00,building_windows_float_processed
1.51596,12.79,3.61,1.62,72.97,0.64,8.07,0.00,0.26,building_windows_float_processed
1.51743,13.30,3.60,1.14,73.09,0.58,8.17,0.00,0.00,building_windows_float_processed
1.51756,13.15,3.61,1.05,73.24,0.57,8.24,0.00,0.00,building_windows_float_processed
1.51918,14.04,3.58,1.37,72.08,0.56,8.30,0.00,0.00,building_windows_float_processed
```

View File

@ -14,7 +14,9 @@ Most popular and widely available dataset of iris flower measurement and class n
To load Iris dataset simple use: To load Iris dataset simple use:
``` ```
$dataset = new Iris(); use Phpml\Dataset\Demo\IrisDataset;
$dataset = new IrisDataset();
``` ```
### Several samples example ### Several samples example

View File

@ -0,0 +1,35 @@
# Wine Dataset
These data are the results of a chemical analysis of wines grown in the same region in Italy but derived from three different cultivars. The analysis determined the quantities of 13 constituents found in each of the three types of wines.
### Specification
| Classes | 3 |
| Samples per class | class 1 59; class 2 71; class 3 48 |
| Samples total | 178 |
| Features per sample | 13 |
### Load
To load the Wine dataset, simply use:
```
use Phpml\Dataset\Demo\WineDataset;
$dataset = new WineDataset();
```
### Several samples example
```
alcohol,malic acid,ash,alcalinity of ash,magnesium,total phenols,flavanoids,nonflavanoid phenols,proanthocyanins,color intensity,hue,OD280/OD315 of diluted wines,proline,class
14.23,1.71,2.43,15.6,127,2.8,3.06,.28,2.29,5.64,1.04,3.92,1065,1
13.2,1.78,2.14,11.2,100,2.65,2.76,.26,1.28,4.38,1.05,3.4,1050,1
13.16,2.36,2.67,18.6,101,2.8,3.24,.3,2.81,5.68,1.03,3.17,1185,1
14.37,1.95,2.5,16.8,113,3.85,3.49,.24,2.18,7.8,.86,3.45,1480,1
13.24,2.59,2.87,21,118,2.8,2.69,.39,1.82,4.32,1.04,2.93,735,1
14.2,1.76,2.45,15.2,112,3.27,3.39,.34,1.97,6.75,1.05,2.85,1450,1
14.39,1.87,2.45,14.6,96,2.5,2.52,.3,1.98,5.25,1.02,3.58,1290,1
14.06,2.15,2.61,17.6,121,2.6,2.51,.31,1.25,5.05,1.06,3.58,1295,1
14.83,1.64,2.17,14,97,2.8,2.98,.29,1.98,5.2,1.08,2.85,1045,1
```

View File

@ -0,0 +1,57 @@
# FilesDataset
Helper class that loads a dataset from files, using folder names as targets. It extends the `ArrayDataset`.
### Constructor Parameters
* $rootPath - (string) path to root folder that contains files dataset
```
use Phpml\Dataset\FilesDataset;
$dataset = new FilesDataset('path/to/data');
```
See [ArrayDataset](array-dataset.md) for more information.
### Example
Files structure:
```
data
business
001.txt
002.txt
...
entertainment
001.txt
002.txt
...
politics
001.txt
002.txt
...
sport
001.txt
002.txt
...
tech
001.txt
002.txt
...
```
Load files data with `FilesDataset`:
```
use Phpml\Dataset\FilesDataset;
$dataset = new FilesDataset('path/to/data');
$dataset->getSamples()[0][0] // content from file path/to/data/business/001.txt
$dataset->getTargets()[0] // business
$dataset->getSamples()[40][0] // content from file path/to/data/tech/001.txt
$dataset->getTargets()[40] // tech
```

View File

@ -0,0 +1,26 @@
# MnistDataset
Helper class that loads data from MNIST dataset: [http://yann.lecun.com/exdb/mnist/](http://yann.lecun.com/exdb/mnist/)
> The MNIST database of handwritten digits, available from this page, has a training set of 60,000 examples, and a test set of 10,000 examples. It is a subset of a larger set available from NIST. The digits have been size-normalized and centered in a fixed-size image.
It is a good database for people who want to try learning techniques and pattern recognition methods on real-world data while spending minimal efforts on preprocessing and formatting.
### Constructor Parameters
* $imagePath - (string) path to image file
* $labelPath - (string) path to label file
```
use Phpml\Dataset\MnistDataset;
$trainDataset = new MnistDataset('train-images-idx3-ubyte', 'train-labels-idx1-ubyte');
```
### Samples and labels
To get samples or labels, you can use getters:
```
$dataset->getSamples();
$dataset->getTargets();
```

View File

@ -0,0 +1,13 @@
# SvmDataset
Helper class that loads data from SVM-Light format file. It extends the `ArrayDataset`.
### Constructor Parameters
* $filepath - (string) path to the file
```
$dataset = new SvmDataset('dataset.svm');
```
See [ArrayDataset](array-dataset.md) for more information.

View File

@ -0,0 +1,42 @@
# Tf-idf Transformer
Tf-idf, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.
### Constructor Parameters
* $samples (array) - samples used to fit the tf-idf model
```
use Phpml\FeatureExtraction\TfIdfTransformer;
$samples = [
[1, 2, 4],
[0, 2, 1]
];
$transformer = new TfIdfTransformer($samples);
```
### Transformation
To transform a collection of text samples, use the `transform` method. Example:
```
use Phpml\FeatureExtraction\TfIdfTransformer;
$samples = [
[0 => 1, 1 => 1, 2 => 2, 3 => 1, 4 => 0, 5 => 0],
[0 => 1, 1 => 1, 2 => 0, 3 => 0, 4 => 2, 5 => 3],
];
$transformer = new TfIdfTransformer($samples);
$transformer->transform($samples);
/*
$samples = [
[0 => 0, 1 => 0, 2 => 0.602, 3 => 0.301, 4 => 0, 5 => 0],
[0 => 0, 1 => 0, 2 => 0, 3 => 0, 4 => 0.602, 5 => 0.903],
];
*/
```

View File

@ -0,0 +1,90 @@
# Token Count Vectorizer
Transform a collection of text samples to a vector of token counts.
### Constructor Parameters
* $tokenizer (Tokenizer) - tokenizer object (see below)
* $minDF (float) - ignore tokens that have a samples frequency strictly lower than the given threshold. This value is also called cut-off in the literature. (default 0)
```
use Phpml\FeatureExtraction\TokenCountVectorizer;
use Phpml\Tokenization\WhitespaceTokenizer;
$vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer());
```
### Transformation
To transform a collection of text samples, use the `transform` method. Example:
```
$samples = [
'Lorem ipsum dolor sit amet dolor',
'Mauris placerat ipsum dolor',
'Mauris diam eros fringilla diam',
];
$vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer());
// Build the dictionary.
$vectorizer->fit($samples);
// Transform the provided text samples into a vectorized list.
$vectorizer->transform($samples);
// return $samples = [
// [0 => 1, 1 => 1, 2 => 2, 3 => 1, 4 => 1],
// [5 => 1, 6 => 1, 1 => 1, 2 => 1],
// [5 => 1, 7 => 2, 8 => 1, 9 => 1],
//];
```
### Vocabulary
You can extract vocabulary using the `getVocabulary()` method. Example:
```
$vectorizer->getVocabulary();
// return $vocabulary = ['Lorem', 'ipsum', 'dolor', 'sit', 'amet', 'Mauris', 'placerat', 'diam', 'eros', 'fringilla'];
```
### Tokenizers
* WhitespaceTokenizer - select tokens by whitespace.
* WordTokenizer - select tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator).
* NGramTokenizer - continuous sequence of characters of the specified length. They are useful for querying languages that don't use spaces or that have long compound words, like German.
**NGramTokenizer**
The NGramTokenizer tokenizer accepts the following parameters:
`$minGram` - minimum length of characters in a gram. Defaults to 1.
`$maxGram` - maximum length of characters in a gram. Defaults to 2.
```php
use Phpml\Tokenization\NGramTokenizer;
$tokenizer = new NGramTokenizer(1, 2);
$tokenizer->tokenize('Quick Fox');
// returns ['Q', 'u', 'i', 'c', 'k', 'Qu', 'ui', 'ic', 'ck', 'F', 'o', 'x', 'Fo', 'ox']
```
**NGramWordTokenizer**
The NGramWordTokenizer tokenizer accepts the following parameters:
`$minGram` - minimum length of characters in a gram. Defaults to 1.
`$maxGram` - maximum length of characters in a gram. Defaults to 2.
```php
use Phpml\Tokenization\NGramWordTokenizer;
$tokenizer = new NGramWordTokenizer(1, 2);
$tokenizer->tokenize('very quick fox');
// returns ['very', 'quick', 'fox', 'very quick', 'quick fox']
```

View File

@ -0,0 +1,96 @@
# SelectKBest
`SelectKBest` - select features according to the k highest scores.
## Constructor Parameters
* $k (int) - number of top features to select, rest will be removed (default: 10)
* $scoringFunction (ScoringFunction) - function that takes samples and targets and returns an array with scores (default: ANOVAFValue)
```php
use Phpml\FeatureSelection\SelectKBest;
$transformer = new SelectKBest(2);
```
## Example of use
As an example we can perform feature selection on Iris dataset to retrieve only the two best features as follows:
```php
use Phpml\FeatureSelection\SelectKBest;
use Phpml\Dataset\Demo\IrisDataset;
$dataset = new IrisDataset();
$selector = new SelectKBest(2);
$selector->fit($samples = $dataset->getSamples(), $dataset->getTargets());
$selector->transform($samples);
/*
$samples[0] = [1.4, 0.2];
*/
```
## Scores
You can get an array with the calculated score for each feature.
A higher value means that a given feature is better suited for learning.
Of course, the rating depends on the scoring function used.
```
use Phpml\FeatureSelection\SelectKBest;
use Phpml\Dataset\Demo\IrisDataset;
$dataset = new IrisDataset();
$selector = new SelectKBest(2);
$selector->fit($samples = $dataset->getSamples(), $dataset->getTargets());
$selector->scores();
/*
..array(4) {
[0]=>
float(119.26450218451)
[1]=>
float(47.364461402997)
[2]=>
float(1179.0343277002)
[3]=>
float(959.32440572573)
}
*/
```
## Scoring function
Available scoring functions:
For classification:
- **ANOVAFValue**
The one-way ANOVA tests the null hypothesis that 2 or more groups have the same population mean.
The test is applied to samples from two or more groups, possibly with differing sizes.
For regression:
- **UnivariateLinearRegression**
Quick linear model for testing the effect of a single regressor, sequentially for many regressors.
This is done in 2 steps:
- 1. The cross correlation between each regressor and the target is computed, that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) *std(y)).
- 2. It is converted to an F score
## Pipeline
`SelectKBest` implements `Transformer` interface so it can be used as part of pipeline:
```php
use Phpml\FeatureSelection\SelectKBest;
use Phpml\Classification\SVC;
use Phpml\FeatureExtraction\TfIdfTransformer;
use Phpml\Pipeline;
$transformers = [
new TfIdfTransformer(),
new SelectKBest(3)
];
$estimator = new SVC();
$pipeline = new Pipeline($transformers, $estimator);
```

View File

@ -0,0 +1,60 @@
# Variance Threshold
`VarianceThreshold` is a simple baseline approach to feature selection.
It removes all features whose variance doesn't meet some threshold.
By default, it removes all zero-variance features, i.e. features that have the same value in all samples.
## Constructor Parameters
* $threshold (float) - features with a variance lower than this threshold will be removed (default 0.0)
```php
use Phpml\FeatureSelection\VarianceThreshold;
$transformer = new VarianceThreshold(0.15);
```
## Example of use
As an example, suppose that we have a dataset with boolean features and
we want to remove all features that are either one or zero (on or off)
in more than 80% of the samples.
Boolean features are Bernoulli random variables, and the variance of such
variables is given by
```
Var[X] = p(1 - p)
```
so we can select using the threshold .8 * (1 - .8):
```php
use Phpml\FeatureSelection\VarianceThreshold;
$samples = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]];
$transformer = new VarianceThreshold(0.8 * (1 - 0.8));
$transformer->fit($samples);
$transformer->transform($samples);
/*
$samples = [[0, 1], [1, 0], [0, 0], [1, 1], [1, 0], [1, 1]];
*/
```
## Pipeline
`VarianceThreshold` implements `Transformer` interface so it can be used as part of pipeline:
```php
use Phpml\FeatureSelection\VarianceThreshold;
use Phpml\Classification\SVC;
use Phpml\FeatureExtraction\TfIdfTransformer;
use Phpml\Pipeline;
$transformers = [
new TfIdfTransformer(),
new VarianceThreshold(0.1)
];
$estimator = new SVC();
$pipeline = new Pipeline($transformers, $estimator);
```

View File

@ -1,10 +1,10 @@
# Accuracy # Accuracy
Class for calculate classifier accuracy. Class for calculating classifier accuracy.
### Score ### Score
To calculate classifier accuracy score use `score` static method. Parameters: To calculate classifier accuracy score, use the `score` static method. Parameters:
* $actualLabels - (array) true sample labels * $actualLabels - (array) true sample labels
* $predictedLabels - (array) predicted labels (e.x. from test group) * $predictedLabels - (array) predicted labels (e.x. from test group)

View File

@ -0,0 +1,66 @@
# Classification Report
Class for calculating main classifier metrics: precision, recall, F1 score and support.
### Report
To generate report you must provide the following parameters:
* $actualLabels - (array) true sample labels
* $predictedLabels - (array) predicted labels (e.g. from test group)
```
use Phpml\Metric\ClassificationReport;
$actualLabels = ['cat', 'ant', 'bird', 'bird', 'bird'];
$predictedLabels = ['cat', 'cat', 'bird', 'bird', 'ant'];
$report = new ClassificationReport($actualLabels, $predictedLabels);
```
Optionally you can provide the following parameter:
* $average - (int) averaging method for multi-class classification
* `ClassificationReport::MICRO_AVERAGE` = 1
* `ClassificationReport::MACRO_AVERAGE` = 2 (default)
* `ClassificationReport::WEIGHTED_AVERAGE` = 3
### Metrics
After creating the report you can draw its individual metrics:
* precision (`getPrecision()`) - fraction of retrieved instances that are relevant
* recall (`getRecall()`) - fraction of relevant instances that are retrieved
* F1 score (`getF1score()`) - measure of a test's accuracy
* support (`getSupport()`) - count of tested samples
```
$precision = $report->getPrecision();
// $precision = ['cat' => 0.5, 'ant' => 0.0, 'bird' => 1.0];
```
### Example
```
use Phpml\Metric\ClassificationReport;
$actualLabels = ['cat', 'ant', 'bird', 'bird', 'bird'];
$predictedLabels = ['cat', 'cat', 'bird', 'bird', 'ant'];
$report = new ClassificationReport($actualLabels, $predictedLabels);
$report->getPrecision();
// ['cat' => 0.5, 'ant' => 0.0, 'bird' => 1.0]
$report->getRecall();
// ['cat' => 1.0, 'ant' => 0.0, 'bird' => 0.67]
$report->getF1score();
// ['cat' => 0.67, 'ant' => 0.0, 'bird' => 0.80]
$report->getSupport();
// ['cat' => 1, 'ant' => 1, 'bird' => 3]
$report->getAverage();
// ['precision' => 0.5, 'recall' => 0.56, 'f1score' => 0.49]
```

View File

@ -0,0 +1,44 @@
# Confusion Matrix
Class for computing confusion matrix to evaluate the accuracy of a classification.
### Example (all targets)
Compute ConfusionMatrix for all targets.
```
use Phpml\Metric\ConfusionMatrix;
$actualTargets = [2, 0, 2, 2, 0, 1];
$predictedTargets = [0, 0, 2, 2, 0, 2];
$confusionMatrix = ConfusionMatrix::compute($actualTargets, $predictedTargets);
/*
$confusionMatrix = [
[2, 0, 0],
[0, 0, 1],
[1, 0, 2],
];
*/
```
### Example (chosen targets)
Compute ConfusionMatrix for chosen targets.
```
use Phpml\Metric\ConfusionMatrix;
$actualTargets = ['cat', 'ant', 'cat', 'cat', 'ant', 'bird'];
$predictedTargets = ['ant', 'ant', 'cat', 'cat', 'ant', 'cat'];
$confusionMatrix = ConfusionMatrix::compute($actualTargets, $predictedTargets, ['ant', 'bird']);
/*
$confusionMatrix = [
[2, 0],
[0, 0],
];
*/
```

View File

@ -0,0 +1,24 @@
# Persistency
You can save trained models for future use. Persistence across requests is achieved by saving serialized estimators to files and restoring them later.
### Example
```
use Phpml\Classification\KNearestNeighbors;
use Phpml\ModelManager;
$samples = [[1, 3], [1, 4], [2, 4], [3, 1], [4, 1], [4, 2]];
$labels = ['a', 'a', 'a', 'b', 'b', 'b'];
$classifier = new KNearestNeighbors();
$classifier->train($samples, $labels);
$filepath = '/path/to/store/the/model';
$modelManager = new ModelManager();
$modelManager->saveToFile($classifier, $filepath);
$restoredClassifier = $modelManager->restoreFromFile($filepath);
$restoredClassifier->predict([3, 2]);
// return 'b'
```

View File

@ -0,0 +1,88 @@
# MLPClassifier
A multilayer perceptron (MLP) is a feedforward artificial neural network model that maps sets of input data onto a set of appropriate outputs.
## Constructor Parameters
* $inputLayerFeatures (int) - the number of input layer features
* $hiddenLayers (array) - array with the hidden layers configuration, each value represents the number of neurons in the corresponding layer
* $classes (array) - array with the different training set classes (array keys are ignored)
* $iterations (int) - number of training iterations
* $learningRate (float) - the learning rate
* $activationFunction (ActivationFunction) - neuron activation function
```
use Phpml\Classification\MLPClassifier;
$mlp = new MLPClassifier(4, [2], ['a', 'b', 'c']);
// 4 nodes in input layer, 2 nodes in first hidden layer and 3 possible labels.
```
An Activation Function may also be passed in with each individual hidden layer. Example:
```
use Phpml\NeuralNetwork\ActivationFunction\PReLU;
use Phpml\NeuralNetwork\ActivationFunction\Sigmoid;
$mlp = new MLPClassifier(4, [[2, new PReLU], [2, new Sigmoid]], ['a', 'b', 'c']);
```
Instead of configuring each hidden layer as an array, they may also be configured with Layer objects. Example:
```
use Phpml\NeuralNetwork\Layer;
use Phpml\NeuralNetwork\Node\Neuron;
$layer1 = new Layer(2, Neuron::class, new PReLU);
$layer2 = new Layer(2, Neuron::class, new Sigmoid);
$mlp = new MLPClassifier(4, [$layer1, $layer2], ['a', 'b', 'c']);
```
## Train
To train a MLP, simply provide train samples and labels (as array). Example:
```
$mlp->train(
$samples = [[1, 0, 0, 0], [0, 1, 1, 0], [1, 1, 1, 1], [0, 0, 0, 0]],
$targets = ['a', 'a', 'b', 'c']
);
```
Use partialTrain method to train in batches. Example:
```
$mlp->partialTrain(
$samples = [[1, 0, 0, 0], [0, 1, 1, 0]],
$targets = ['a', 'a']
);
$mlp->partialTrain(
$samples = [[1, 1, 1, 1], [0, 0, 0, 0]],
$targets = ['b', 'c']
);
```
You can update the learning rate between partialTrain runs:
```
$mlp->setLearningRate(0.1);
```
## Predict
To predict sample label use the `predict` method. You can provide one sample or array of samples:
```
$mlp->predict([[1, 1, 1, 1], [0, 0, 0, 0]]);
// return ['b', 'c'];
```
## Activation Functions
* BinaryStep
* Gaussian
* HyperbolicTangent
* Parametric Rectified Linear Unit
* Sigmoid (default)
* Thresholded Rectified Linear Unit

View File

@ -0,0 +1,67 @@
# Imputation missing values
For various reasons, many real world datasets contain missing values, often encoded as blanks, NaNs or other placeholders.
To solve this problem you can use the `Imputer` class.
## Constructor Parameters
* $missingValue (mixed) - this value will be replaced (default null)
* $strategy (Strategy) - imputation strategy (ready to use: MeanStrategy, MedianStrategy, MostFrequentStrategy)
* $axis (int) - axis for strategy, Imputer::AXIS_COLUMN or Imputer::AXIS_ROW
* $samples (array) - array of samples to train
```
$imputer = new Imputer(null, new MeanStrategy(), Imputer::AXIS_COLUMN);
$imputer = new Imputer(null, new MedianStrategy(), Imputer::AXIS_ROW);
```
## Strategy
* MeanStrategy - replace missing values using the mean along the axis
* MedianStrategy - replace missing values using the median along the axis
* MostFrequentStrategy - replace missing values using the most frequent value along the axis
## Example of use
```
use Phpml\Preprocessing\Imputer;
use Phpml\Preprocessing\Imputer\Strategy\MeanStrategy;
$data = [
[1, null, 3, 4],
[4, 3, 2, 1],
[null, 6, 7, 8],
[8, 7, null, 5],
];
$imputer = new Imputer(null, new MeanStrategy(), Imputer::AXIS_COLUMN);
$imputer->fit($data);
$imputer->transform($data);
/*
$data = [
[1, 5.33, 3, 4],
[4, 3, 2, 1],
[4.33, 6, 7, 8],
[8, 7, 4, 5],
];
*/
```
You can also use the `$samples` constructor parameter instead of the `fit` method:
```
use Phpml\Preprocessing\Imputer;
use Phpml\Preprocessing\Imputer\Strategy\MeanStrategy;
$data = [
[1, null, 3, 4],
[4, 3, 2, 1],
[null, 6, 7, 8],
[8, 7, null, 5],
];
$imputer = new Imputer(null, new MeanStrategy(), Imputer::AXIS_COLUMN, $data);
$imputer->transform($data);
```

View File

@ -0,0 +1,59 @@
# Normalization
Normalization is the process of scaling individual samples to have unit norm.
## L2 norm
[http://mathworld.wolfram.com/L2-Norm.html](http://mathworld.wolfram.com/L2-Norm.html)
Example:
```
use Phpml\Preprocessing\Normalizer;
$samples = [
[1, -1, 2],
[2, 0, 0],
[0, 1, -1],
];
$normalizer = new Normalizer();
$normalizer->preprocess($samples);
/*
$samples = [
[0.4, -0.4, 0.81],
[1.0, 0.0, 0.0],
[0.0, 0.7, -0.7],
];
*/
```
## L1 norm
[http://mathworld.wolfram.com/L1-Norm.html](http://mathworld.wolfram.com/L1-Norm.html)
Example:
```
use Phpml\Preprocessing\Normalizer;
$samples = [
[1, -1, 2],
[2, 0, 0],
[0, 1, -1],
];
$normalizer = new Normalizer(Normalizer::NORM_L1);
$normalizer->preprocess($samples);
/*
$samples = [
[0.25, -0.25, 0.5],
[1.0, 0.0, 0.0],
[0.0, 0.5, -0.5],
];
*/
```

View File

@ -1,10 +1,10 @@
# LeastSquares Linear Regression # LeastSquares Linear Regression
Linear model that use least squares method to approximate solution. Linear model that uses least squares method to approximate solution.
### Train ### Train
To train a model simply provide train samples and targets values (as `array`). Example: To train a model, simply provide train samples and targets values (as `array`). Example:
``` ```
$samples = [[60], [61], [62], [63], [65]]; $samples = [[60], [61], [62], [63], [65]];
@ -14,9 +14,11 @@ $regression = new LeastSquares();
$regression->train($samples, $targets); $regression->train($samples, $targets);
``` ```
You can train the model using multiple data sets, predictions will be based on all the training data.
### Predict ### Predict
To predict sample target value use `predict` method with sample to check (as `array`). Example: To predict sample target value, use the `predict` method with sample to check (as `array`). Example:
``` ```
$regression->predict([64]); $regression->predict([64]);
@ -26,7 +28,7 @@ $regression->predict([64]);
### Multiple Linear Regression ### Multiple Linear Regression
The term multiple attached to linear regression means that there are two or more sample parameters used to predict target. The term multiple attached to linear regression means that there are two or more sample parameters used to predict target.
For example you can use: mileage and production year to predict price of a car. For example you can use: mileage and production year to predict the price of a car.
``` ```
$samples = [[73676, 1996], [77006, 1998], [10565, 2000], [146088, 1995], [15000, 2001], [65940, 2000], [9300, 2000], [93739, 1996], [153260, 1994], [17764, 2002], [57000, 1998], [15000, 2000]]; $samples = [[73676, 1996], [77006, 1998], [10565, 2000], [146088, 1995], [15000, 2001], [65940, 2000], [9300, 2000], [93739, 1996], [153260, 1994], [17764, 2002], [57000, 1998], [15000, 2000]];
@ -40,7 +42,7 @@ $regression->predict([60000, 1996])
### Intercept and Coefficients ### Intercept and Coefficients
After you train your model you can get the intercept and coefficients array. After you train your model, you can get the intercept and coefficients array.
``` ```
$regression->getIntercept(); $regression->getIntercept();

View File

@ -0,0 +1,46 @@
# Support Vector Regression
Class implementing Epsilon-Support Vector Regression based on libsvm.
### Constructor Parameters
* $kernel (int) - kernel type to be used in the algorithm (default Kernel::RBF)
* $degree (int) - degree of the Kernel::POLYNOMIAL function (default 3)
* $epsilon (float) - epsilon in loss function of epsilon-SVR (default 0.1)
* $cost (float) - parameter C of epsilon-SVR (default 1.0)
* $gamma (float) - kernel coefficient for Kernel::RBF, Kernel::POLYNOMIAL and Kernel::SIGMOID. If gamma is null then 1/features will be used instead.
* $coef0 (float) - independent term in kernel function. It is only significant in Kernel::POLYNOMIAL and Kernel::SIGMOID (default 0.0)
* $tolerance (float) - tolerance of termination criterion (default 0.001)
* $cacheSize (int) - cache memory size in MB (default 100)
* $shrinking (bool) - whether to use the shrinking heuristics (default true)
```
$regression = new SVR(Kernel::LINEAR);
$regression = new SVR(Kernel::LINEAR, $degree = 3, $epsilon=10.0);
```
### Train
To train a model, simply provide train samples and targets values (as `array`). Example:
```
use Phpml\Regression\SVR;
use Phpml\SupportVectorMachine\Kernel;
$samples = [[60], [61], [62], [63], [65]];
$targets = [3.1, 3.6, 3.8, 4, 4.1];
$regression = new SVR(Kernel::LINEAR);
$regression->train($samples, $targets);
```
You can train the model using multiple data sets, predictions will be based on all the training data.
### Predict
To predict sample target value, use the `predict` method. You can provide one sample or array of samples:
```
$regression->predict([64]);
// return 4.03
```

View File

@ -0,0 +1,65 @@
# Pipeline
In machine learning, it is common to run a sequence of algorithms to process and learn from a dataset. For example:
* Split each document's text into tokens.
* Convert each document's words into a numerical feature vector ([Token Count Vectorizer](machine-learning/feature-extraction/token-count-vectorizer/)).
* Learn a prediction model using the feature vectors and labels.
PHP-ML represents such a workflow as a Pipeline, which consists of a sequence of transformers and an estimator.
### Constructor Parameters
* $transformers (array|Transformer[]) - sequence of objects that implement the Transformer interface
* $estimator (Estimator) - estimator that can train and predict
```
use Phpml\Classification\SVC;
use Phpml\FeatureExtraction\TfIdfTransformer;
use Phpml\Pipeline;
$transformers = [
new TfIdfTransformer(),
];
$estimator = new SVC();
$pipeline = new Pipeline($transformers, $estimator);
```
### Example
First, our pipeline replaces the missing value, then normalizes samples and finally trains the SVC estimator.
The pipeline prepared this way repeats each transformation step for every sample passed to `predict`.
```
use Phpml\Classification\SVC;
use Phpml\Pipeline;
use Phpml\Preprocessing\Imputer;
use Phpml\Preprocessing\Normalizer;
use Phpml\Preprocessing\Imputer\Strategy\MostFrequentStrategy;
$transformers = [
new Imputer(null, new MostFrequentStrategy()),
new Normalizer(),
];
$estimator = new SVC();
$samples = [
[1, -1, 2],
[2, 0, null],
[null, 1, -1],
];
$targets = [
4,
1,
4,
];
$pipeline = new Pipeline($transformers, $estimator);
$pipeline->train($samples, $targets);
$predicted = $pipeline->predict([[0, 0, 0]]);
// $predicted == [4]
```

View File

@ -4,7 +4,7 @@ Selected algorithms require the use of a function for calculating the distance.
### Euclidean ### Euclidean
Class for calculation Euclidean distance. Class for calculating Euclidean distance.
![euclidean](https://upload.wikimedia.org/math/8/4/9/849f040fd10bb86f7c85eb0bbe3566a4.png "Euclidean Distance") ![euclidean](https://upload.wikimedia.org/math/8/4/9/849f040fd10bb86f7c85eb0bbe3566a4.png "Euclidean Distance")
@ -21,7 +21,7 @@ $euclidean->distance($a, $b);
### Manhattan ### Manhattan
Class for calculation Manhattan distance. Class for calculating Manhattan distance.
![manhattan](https://upload.wikimedia.org/math/4/c/5/4c568bd1d76a6b15e19cb2ac3ad75350.png "Manhattan Distance") ![manhattan](https://upload.wikimedia.org/math/4/c/5/4c568bd1d76a6b15e19cb2ac3ad75350.png "Manhattan Distance")
@ -38,7 +38,7 @@ $manhattan->distance($a, $b);
### Chebyshev ### Chebyshev
Class for calculation Chebyshev distance. Class for calculating Chebyshev distance.
![chebyshev](https://upload.wikimedia.org/math/7/1/2/71200f7dbb43b3bcfbcbdb9e02ab0a0c.png "Chebyshev Distance") ![chebyshev](https://upload.wikimedia.org/math/7/1/2/71200f7dbb43b3bcfbcbdb9e02ab0a0c.png "Chebyshev Distance")
@ -55,7 +55,7 @@ $chebyshev->distance($a, $b);
### Minkowski ### Minkowski
Class for calculation Minkowski distance. Class for calculating Minkowski distance.
![minkowski](https://upload.wikimedia.org/math/a/a/0/aa0c62083c12390cb15ac3217de88e66.png "Minkowski Distance") ![minkowski](https://upload.wikimedia.org/math/a/a/0/aa0c62083c12390cb15ac3217de88e66.png "Minkowski Distance")
@ -83,7 +83,7 @@ $minkowski->distance($a, $b);
### Custom distance ### Custom distance
To apply your own function of distance use `Distance` interface. Example To apply your own function of distance use the `Distance` interface. Example:
``` ```
class CustomDistance implements Distance class CustomDistance implements Distance

127
docs/math/set.md Normal file
View File

@ -0,0 +1,127 @@
# Set
Class that wraps PHP arrays containing primitive types to mathematical sets.
### Creation
To create Set use flat arrays containing primitives only:
```
use \Phpml\Math\Set;
$set = new Set([1, 2, 2, 3, 1.1, -1, -10]);
$set->toArray();
// return [-10, -1, 1, 1.1, 2, 3]
$set = new Set(['B', '', 'A']);
$set->toArray();
// return ['', 'A', 'B']
```
Injected array is sorted by SORT_ASC, duplicates are removed and index is rewritten.
### Union
Create the union of two Sets:
```
use \Phpml\Math\Set;
$union = Set::union(new Set([1, 3]), new Set([1, 2]));
$union->toArray();
//return [1, 2, 3]
```
### Intersection
Create the intersection of two Sets:
```
use \Phpml\Math\Set;
$intersection = Set::intersection(new Set(['A', 'C']), new Set(['B', 'C']));
$intersection->toArray();
//return ['C']
```
### Complement
Create the set-theoretic difference of two Sets:
```
use \Phpml\Math\Set;
$difference = Set::difference(new Set(['A', 'B', 'C']), new Set(['A']));
$difference->toArray();
//return ['B', 'C']
```
### Adding elements
```
use \Phpml\Math\Set;
$set = new Set([1, 2]);
$set->addAll([3]);
$set->add(4);
$set->toArray();
//return [1, 2, 3, 4]
```
### Removing elements
```
use \Phpml\Math\Set;
$set = new Set([1, 2]);
$set->removeAll([2]);
$set->remove(1);
$set->toArray();
//return []
```
### Check membership
```
use \Phpml\Math\Set;
$set = new Set([1, 2]);
$set->containsAll([2, 3]);
//return false
$set->contains(1);
//return true
```
### Cardinality
```
use \Phpml\Math\Set;
$set = new Set([1, 2]);
$set->cardinality();
//return 2
```
### Is empty
```
use \Phpml\Math\Set;
$set = new Set();
$set->isEmpty();
//return true
```
### Working with loops
```
use \Phpml\Math\Set;
$set = new Set(['A', 'B', 'C']);
foreach($set as $element) {
echo "$element, ";
}
// echoes A, B, C
```

69
ecs.yml Normal file
View File

@ -0,0 +1,69 @@
imports:
- { resource: 'vendor/symplify/easy-coding-standard/config/set/psr2.yaml' }
- { resource: 'vendor/symplify/easy-coding-standard/config/set/php71.yaml' }
- { resource: 'vendor/symplify/easy-coding-standard/config/set/clean-code.yaml' }
- { resource: 'vendor/symplify/easy-coding-standard/config/set/common.yaml' }
services:
# spacing
PhpCsFixer\Fixer\PhpTag\BlankLineAfterOpeningTagFixer: ~
PhpCsFixer\Fixer\Whitespace\BlankLineBeforeStatementFixer: ~
PhpCsFixer\Fixer\CastNotation\CastSpacesFixer: ~
PhpCsFixer\Fixer\Operator\ConcatSpaceFixer:
spacing: none
PhpCsFixer\Fixer\ClassNotation\MethodSeparationFixer: ~
PhpCsFixer\Fixer\ClassNotation\NoBlankLinesAfterClassOpeningFixer: ~
PhpCsFixer\Fixer\Whitespace\NoSpacesAroundOffsetFixer:
positions: ['inside', 'outside']
PhpCsFixer\Fixer\Operator\BinaryOperatorSpacesFixer:
align_double_arrow: false
align_equals: false
PhpCsFixer\Fixer\PhpUnit\PhpUnitTestCaseStaticMethodCallsFixer:
call_type: 'self'
# phpdoc
PhpCsFixer\Fixer\Phpdoc\PhpdocSeparationFixer: ~
PhpCsFixer\Fixer\Phpdoc\PhpdocAlignFixer: ~
# Symplify
Symplify\CodingStandard\Fixer\Import\ImportNamespacedNameFixer: ~
Symplify\CodingStandard\Fixer\Php\ClassStringToClassConstantFixer: ~
Symplify\CodingStandard\Fixer\Property\ArrayPropertyDefaultValueFixer: ~
Symplify\CodingStandard\Fixer\ArrayNotation\StandaloneLineInMultilineArrayFixer: ~
parameters:
skip:
# from strict.neon
PhpCsFixer\Fixer\PhpUnit\PhpUnitStrictFixer: ~
PhpCsFixer\Fixer\Strict\StrictComparisonFixer: ~
# personal preference
PhpCsFixer\Fixer\Operator\NotOperatorWithSuccessorSpaceFixer: ~
PhpCsFixer\Fixer\Alias\RandomApiMigrationFixer:
# random_int() breaks code
- 'src/CrossValidation/RandomSplit.php'
SlevomatCodingStandard\Sniffs\Classes\UnusedPrivateElementsSniff:
# magic calls
- 'src/Preprocessing/Normalizer.php'
PhpCsFixer\Fixer\StringNotation\ExplicitStringVariableFixer:
# bugged
- 'src/Classification/DecisionTree/DecisionTreeLeaf.php'
Symplify\CodingStandard\Fixer\Commenting\RemoveUselessDocBlockFixer:
# false positive - already fixed in master
- 'src/Helper/OneVsRest.php'
# bug in fixer
- 'src/Math/LinearAlgebra/LUDecomposition.php'
PhpCsFixer\Fixer\FunctionNotation\VoidReturnFixer:
# covariant return types
- 'src/Classification/Linear/Perceptron.php'
# missing typehints
SlevomatCodingStandard\Sniffs\TypeHints\TypeHintDeclarationSniff.MissingParameterTypeHint: ~
SlevomatCodingStandard\Sniffs\TypeHints\TypeHintDeclarationSniff.MissingTraversableParameterTypeHintSpecification: ~
SlevomatCodingStandard\Sniffs\TypeHints\TypeHintDeclarationSniff.MissingReturnTypeHint: ~
SlevomatCodingStandard\Sniffs\TypeHints\TypeHintDeclarationSniff.MissingTraversableReturnTypeHintSpecification: ~
SlevomatCodingStandard\Sniffs\TypeHints\TypeHintDeclarationSniff.MissingPropertyTypeHint: ~
SlevomatCodingStandard\Sniffs\TypeHints\TypeHintDeclarationSniff.MissingTraversablePropertyTypeHintSpecification: ~
# assignment in "while ($var = ...)" are ok
PHP_CodeSniffer\Standards\Generic\Sniffs\CodeAnalysis\AssignmentInConditionSniff.FoundInWhileCondition:

View File

@ -1,25 +1,54 @@
site_name: PHP Machine Learning (PHP-ML) site_name: PHP-ML - Machine Learning library for PHP
pages: pages:
- Home: index.md - Home: index.md
- Machine Learning: - Machine Learning:
- Association rule learning:
- Apriori: machine-learning/association/apriori.md
- Classification: - Classification:
- SVC: machine-learning/classification/svc.md
- KNearestNeighbors: machine-learning/classification/k-nearest-neighbors.md - KNearestNeighbors: machine-learning/classification/k-nearest-neighbors.md
- NaiveBayes: machine-learning/classification/naive-bayes.md - NaiveBayes: machine-learning/classification/naive-bayes.md
- Regression: - Regression:
- LeastSquares: machine-learning/regression/least-squares.md - LeastSquares: machine-learning/regression/least-squares.md
- SVR: machine-learning/regression/svr.md
- Clustering: - Clustering:
- KMeans: machine-learning/clustering/k-means.md - KMeans: machine-learning/clustering/k-means.md
- DBSCAN: machine-learning/clustering/dbscan.md - DBSCAN: machine-learning/clustering/dbscan.md
- Metric:
- Accuracy: machine-learning/metric/accuracy.md
- Confusion Matrix: machine-learning/metric/confusion-matrix.md
- Classification Report: machine-learning/metric/classification-report.md
- Workflow:
- Pipeline: machine-learning/workflow/pipeline.md
- Neural Network:
- Multilayer Perceptron Classifier: machine-learning/neural-network/multilayer-perceptron-classifier.md
- Cross Validation: - Cross Validation:
- RandomSplit: machine-learning/cross-validation/random-split.md - RandomSplit: machine-learning/cross-validation/random-split.md
- Stratified Random Split: machine-learning/cross-validation/stratified-random-split.md
- Feature Selection:
- VarianceThreshold: machine-learning/feature-selection/variance-threshold.md
- SelectKBest: machine-learning/feature-selection/selectkbest.md
- Preprocessing:
- Normalization: machine-learning/preprocessing/normalization.md
- Imputation missing values: machine-learning/preprocessing/imputation-missing-values.md
- Feature Extraction:
- Token Count Vectorizer: machine-learning/feature-extraction/token-count-vectorizer.md
- Tf-idf Transformer: machine-learning/feature-extraction/tf-idf-transformer.md
- Datasets: - Datasets:
- Array Dataset: machine-learning/datasets/array-dataset.md - Array Dataset: machine-learning/datasets/array-dataset.md
- CSV Dataset: machine-learning/datasets/csv-dataset.md - CSV Dataset: machine-learning/datasets/csv-dataset.md
- Files Dataset: machine-learning/datasets/files-dataset.md
- SVM Dataset: machine-learning/datasets/svm-dataset.md
- MNIST Dataset: machine-learning/datasets/mnist-dataset.md
- Ready to use datasets: - Ready to use datasets:
- Iris: machine-learning/datasets/demo/iris.md - Iris: machine-learning/datasets/demo/iris.md
- Metric: - Wine: machine-learning/datasets/demo/wine.md
- Accuracy: machine-learning/metric/accuracy.md - Glass: machine-learning/datasets/demo/glass.md
- Models management:
- Persistency: machine-learning/model-manager/persistency.md
- Math: - Math:
- Distance: math/distance.md - Distance: math/distance.md
- Matrix: math/matrix.md - Matrix: math/matrix.md
- Set: math/set.md
- Statistic: math/statistic.md
theme: readthedocs theme: readthedocs

17
phpbench.json Normal file
View File

@ -0,0 +1,17 @@
{
"bootstrap": "tests/Performance/bootstrap.php",
"path": "tests/Performance",
"reports": {
"time": {
"extends": "aggregate",
"title": "The Consumption of Time",
"cols": [ "subject", "mode", "mean", "rstdev", "diff"]
},
"memory": {
"extends": "aggregate",
"title": "The Memory Usage",
"cols": [ "subject", "mem_real", "mem_final", "mem_peak", "diff"],
"diff_col": "mem_peak"
}
}
}

21
phpstan.neon Normal file
View File

@ -0,0 +1,21 @@
includes:
- vendor/phpstan/phpstan-strict-rules/rules.neon
- vendor/phpstan/phpstan-phpunit/extension.neon
- vendor/phpstan/phpstan-phpunit/rules.neon
parameters:
checkGenericClassInNonGenericObjectType: false
checkMissingIterableValueType: false
ignoreErrors:
- '#Property Phpml\\Clustering\\KMeans\\Cluster\:\:\$points \(iterable\<Phpml\\Clustering\\KMeans\\Point\>\&SplObjectStorage\) does not accept SplObjectStorage#'
- '#Phpml\\Dataset\\(.*)Dataset::__construct\(\) does not call parent constructor from Phpml\\Dataset\\ArrayDataset#'
- '#Variable property access on .+#'
- '#Variable method call on .+#'
- message: '#ReflectionClass#'
paths:
- src/Classification/Ensemble/AdaBoost.php
- src/Classification/Ensemble/Bagging.php
# probably known value
- '#Method Phpml\\Classification\\DecisionTree::getBestSplit\(\) should return Phpml\\Classification\\DecisionTree\\DecisionTreeLeaf but returns Phpml\\Classification\\DecisionTree\\DecisionTreeLeaf\|null#'
- '#Call to an undefined method Phpml\\Helper\\Optimizer\\Optimizer::getCostValues\(\)#'

View File

@ -1,14 +1,24 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<phpunit <phpunit
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:noNamespaceSchemaLocation="vendor/phpunit/phpunit/phpunit.xsd"
colors="true" colors="true"
beStrictAboutTestsThatDoNotTestAnything="true" beStrictAboutTestsThatDoNotTestAnything="true"
beStrictAboutOutputDuringTests="true" beStrictAboutOutputDuringTests="true"
beStrictAboutTestSize="true"
beStrictAboutChangesToGlobalState="true" beStrictAboutChangesToGlobalState="true"
> >
<testsuites>
<testsuite name="PHP-ML Test Suite"> <testsuite name="PHP-ML Test Suite">
<directory>tests/*</directory> <directory>tests</directory>
</testsuite> </testsuite>
</testsuites>
<filter>
<whitelist processUncoveredFilesFromWhitelist="true">
<directory suffix=".php">src</directory>
</whitelist>
</filter>
<php>
<ini name="error_reporting" value="E_ALL" />
</php>
</phpunit> </phpunit>

332
src/Association/Apriori.php Normal file
View File

@ -0,0 +1,332 @@
<?php
declare(strict_types=1);
namespace Phpml\Association;
use Phpml\Helper\Predictable;
use Phpml\Helper\Trainable;
/**
 * Apriori association rule learner.
 *
 * Mines the item sets whose support reaches the configured minimum and derives
 * from them every rule whose confidence reaches the configured minimum.
 * Items are compared by their string representation (array_diff semantics).
 */
class Apriori implements Associator
{
    use Trainable;
    use Predictable;

    public const ARRAY_KEY_ANTECEDENT = 'antecedent';

    public const ARRAY_KEY_CONFIDENCE = 'confidence';

    public const ARRAY_KEY_CONSEQUENT = 'consequent';

    public const ARRAY_KEY_SUPPORT = 'support';

    /**
     * Minimum confidence a rule must reach to be generated.
     *
     * @var float
     */
    private $confidence;

    /**
     * The large set contains frequent k-length item sets, indexed by k.
     *
     * @var mixed[][][]
     */
    private $large = [];

    /**
     * Minimum relative frequency an item set must reach to count as frequent.
     *
     * @var float
     */
    private $support;

    /**
     * The generated Apriori association rules.
     *
     * @var mixed[][]
     */
    private $rules = [];

    /**
     * @param float $support    minimum support of a frequent item set
     * @param float $confidence minimum confidence of a generated rule
     */
    public function __construct(float $support = 0.0, float $confidence = 0.0)
    {
        $this->support = $support;
        $this->confidence = $confidence;
    }

    /**
     * Get all association rules which are generated for every k-length frequent item set.
     *
     * Both the frequent item sets and the rules are cached in this object: once
     * they are non-empty this method returns them without recomputing.
     *
     * @return mixed[][] list of rules, each keyed by the ARRAY_KEY_* constants
     */
    public function getRules(): array
    {
        if (count($this->large) === 0) {
            $this->large = $this->apriori();
        }

        if (count($this->rules) === 0) {
            $this->generateAllRules();
        }

        return $this->rules;
    }

    /**
     * Generates frequent item sets.
     *
     * @return mixed[][][] frequent item sets indexed by their length k (starting at 1)
     */
    public function apriori(): array
    {
        $L = [];

        $items = $this->frequent($this->items());
        // frequent() re-indexes its result, so a missing index 0 means "no more sets".
        for ($k = 1; isset($items[0]); ++$k) {
            $L[$k] = $items;
            $items = $this->frequent($this->candidates($items));
        }

        return $L;
    }

    /**
     * Returns the consequents of all rules whose antecedent equals the sample.
     *
     * @param mixed[] $sample
     *
     * @return mixed[][]
     */
    protected function predictSample(array $sample): array
    {
        $predicts = array_values(array_filter($this->getRules(), function ($rule) use ($sample): bool {
            return $this->equals($rule[self::ARRAY_KEY_ANTECEDENT], $sample);
        }));

        return array_map(static function ($rule) {
            return $rule[self::ARRAY_KEY_CONSEQUENT];
        }, $predicts);
    }

    /**
     * Generate rules for each k-length frequent item set.
     */
    private function generateAllRules(): void
    {
        // Rules need a non-empty antecedent and consequent, hence k starts at 2.
        for ($k = 2; isset($this->large[$k]); ++$k) {
            foreach ($this->large[$k] as $frequent) {
                $this->generateRules($frequent);
            }
        }
    }

    /**
     * Generate confident rules for frequent item set.
     *
     * @param mixed[] $frequent
     */
    private function generateRules(array $frequent): void
    {
        // The support of the whole item set is identical for every antecedent.
        $support = $this->support($frequent);

        foreach ($this->antecedents($frequent) as $antecedent) {
            $confidence = $this->confidence($frequent, $antecedent);
            if ($this->confidence <= $confidence) {
                $consequent = array_values(array_diff($frequent, $antecedent));
                $this->rules[] = [
                    self::ARRAY_KEY_ANTECEDENT => $antecedent,
                    self::ARRAY_KEY_CONSEQUENT => $consequent,
                    self::ARRAY_KEY_SUPPORT => $support,
                    self::ARRAY_KEY_CONFIDENCE => $confidence,
                ];
            }
        }
    }

    /**
     * Generates the power set for given item set $sample.
     *
     * @param mixed[] $sample
     *
     * @return mixed[][]
     */
    private function powerSet(array $sample): array
    {
        $results = [[]];
        foreach ($sample as $item) {
            // foreach iterates over a snapshot, so appending here does not loop forever.
            foreach ($results as $combination) {
                $results[] = array_merge([$item], $combination);
            }
        }

        return $results;
    }

    /**
     * Generates all proper subsets for given set $sample without the empty set.
     *
     * Keys of the power set are preserved, so the result is not re-indexed.
     *
     * @param mixed[] $sample
     *
     * @return mixed[][]
     */
    private function antecedents(array $sample): array
    {
        $cardinality = count($sample);
        $antecedents = $this->powerSet($sample);

        return array_filter($antecedents, static function ($antecedent) use ($cardinality): bool {
            return count($antecedent) !== $cardinality && $antecedent !== [];
        });
    }

    /**
     * Collects every distinct item of the training samples as a one-element
     * item set. Filtering by support is done by the caller.
     *
     * @return mixed[][]
     */
    private function items(): array
    {
        $items = [];

        foreach ($this->samples as $sample) {
            foreach ($sample as $item) {
                if (!in_array($item, $items, true)) {
                    $items[] = $item;
                }
            }
        }

        return array_map(static function ($entry): array {
            return [$entry];
        }, $items);
    }

    /**
     * Returns frequent item sets only.
     *
     * @param mixed[][] $samples
     *
     * @return mixed[][] re-indexed list of the item sets reaching the minimum support
     */
    private function frequent(array $samples): array
    {
        return array_values(array_filter($samples, function ($entry): bool {
            return $this->support($entry) >= $this->support;
        }));
    }

    /**
     * Builds candidate item sets of length k from item sets of length k - 1.
     *
     * Two sets are joined when they differ in exactly one item each; a candidate
     * is kept only once and only if at least one training sample contains it.
     *
     * @param mixed[][] $samples
     *
     * @return mixed[][]
     */
    private function candidates(array $samples): array
    {
        $candidates = [];

        foreach ($samples as $p) {
            foreach ($samples as $q) {
                if (count(array_merge(array_diff($p, $q), array_diff($q, $p))) !== 2) {
                    continue;
                }

                $candidate = array_values(array_unique(array_merge($p, $q)));

                if ($this->contains($candidates, $candidate)) {
                    continue;
                }

                foreach ($this->samples as $sample) {
                    if ($this->subset($sample, $candidate)) {
                        $candidates[] = $candidate;

                        // One containing sample is enough; move on to the next $q.
                        continue 2;
                    }
                }
            }
        }

        return $candidates;
    }

    /**
     * Calculates confidence for $set. Confidence is the relative amount of sets containing $subset which also contain
     * $set.
     *
     * @param mixed[] $set
     * @param mixed[] $subset
     */
    private function confidence(array $set, array $subset): float
    {
        return $this->support($set) / $this->support($subset);
    }

    /**
     * Calculates support for item set $sample. Support is the relative amount of sets containing $sample in the data
     * pool.
     *
     * @see \Phpml\Association\Apriori::samples
     *
     * @param mixed[] $sample
     */
    private function support(array $sample): float
    {
        return $this->frequency($sample) / count($this->samples);
    }

    /**
     * Counts occurrences of $sample as subset in data pool.
     *
     * @see \Phpml\Association\Apriori::samples
     *
     * @param mixed[] $sample
     */
    private function frequency(array $sample): int
    {
        return count(array_filter($this->samples, function ($entry) use ($sample): bool {
            return $this->subset($entry, $sample);
        }));
    }

    /**
     * Returns true if set is an element of system.
     *
     * @see \Phpml\Association\Apriori::equals()
     *
     * @param mixed[][] $system
     * @param mixed[]   $set
     */
    private function contains(array $system, array $set): bool
    {
        // Stop at the first match instead of filtering the whole system.
        foreach ($system as $entry) {
            if ($this->equals($entry, $set)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Returns true if every item of subset is also in set (equal sets included),
     * comparing items by their string representation.
     *
     * @param mixed[] $set
     * @param mixed[] $subset
     */
    private function subset(array $set, array $subset): bool
    {
        return count(array_diff($subset, array_intersect($subset, $set))) === 0;
    }

    /**
     * Returns true if string representation of items does not differ.
     *
     * Both differences must be empty. Comparing the two differences with "=="
     * would also accept leftovers that are merely loosely equal (e.g. '1' and
     * '1.0'), although array_diff treats them as different items.
     *
     * @param mixed[] $set1
     * @param mixed[] $set2
     */
    private function equals(array $set1, array $set2): bool
    {
        return count(array_diff($set1, $set2)) === 0
            && count(array_diff($set2, $set1)) === 0;
    }
}

View File

@ -0,0 +1,11 @@
<?php
declare(strict_types=1);
namespace Phpml\Association;
use Phpml\Estimator;
/**
* Marker interface for association estimators (implemented by Apriori).
* It declares no members of its own beyond what Estimator requires.
*/
interface Associator extends Estimator
{
}

View File

@ -0,0 +1,11 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification;
use Phpml\Estimator;
/**
* Marker interface for classification estimators (implemented by DecisionTree).
* It declares no members of its own beyond what Estimator requires.
*/
interface Classifier extends Estimator
{
}

View File

@ -0,0 +1,484 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification;
use Phpml\Classification\DecisionTree\DecisionTreeLeaf;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Helper\Predictable;
use Phpml\Helper\Trainable;
use Phpml\Math\Statistic\Mean;
class DecisionTree implements Classifier
{
use Trainable;
use Predictable;
/**
* Column type marker assigned by getColumnTypes() to columns that
* isCategoricalColumn() does not report as categorical.
*/
public const CONTINUOUS = 1;
/**
* Column type marker assigned by getColumnTypes() to columns that
* isCategoricalColumn() reports as categorical.
*/
public const NOMINAL = 2;
/**
* NOTE(review): not written in the visible part of this class; presumably
* the depth actually reached by the trained tree - confirm.
*
* @var int
*/
public $actualDepth = 0;
/**
* Per-column type markers (self::NOMINAL or self::CONTINUOUS), recomputed
* by train() from all accumulated samples.
*
* @var array
*/
protected $columnTypes = [];
/**
* Root of the tree; rebuilt by train() through getSplitLeaf().
*
* @var DecisionTreeLeaf
*/
protected $tree;
/**
* Depth limit handed to the constructor (default 10).
*
* @var int
*/
protected $maxDepth;
/**
* Distinct target values of the accumulated training data, refreshed on
* every train() call.
*
* @var array
*/
private $labels = [];
/**
* Number of columns of the first accumulated training sample.
*
* @var int
*/
private $featureCount = 0;
/**
* Value set through setNumFeatures(); 0 means all features are used for a
* split, otherwise it is the maximum number of randomly selected columns.
*
* @var int
*/
private $numUsableFeatures = 0;
/**
* NOTE(review): not referenced in the visible part of this class;
* presumably the column indices chosen for splitting - confirm.
*
* @var array
*/
private $selectedFeatures = [];
/**
* Cached feature importances; train() resets this to null so they are
* computed again for the new data.
*
* @var array|null
*/
private $featureImportances;
/**
* Column names passed to the tree's getHTML(); train() fills in or trims
* them so their number matches the feature count.
*
* @var array
*/
private $columnNames = [];
/**
* @param int $maxDepth value stored as the depth limit of the tree (default 10)
*/
public function __construct(int $maxDepth = 10)
{
$this->maxDepth = $maxDepth;
}
/**
 * Appends the given samples and targets to the data already stored and
 * rebuilds the tree from everything accumulated so far.
 */
public function train(array $samples, array $targets): void
{
    // Training is cumulative: earlier data is kept and extended.
    $this->samples = array_merge($this->samples, $samples);
    $this->targets = array_merge($this->targets, $targets);

    $this->featureCount = count($this->samples[0]);
    $this->columnTypes = self::getColumnTypes($this->samples);
    $this->labels = array_keys(array_count_values($this->targets));

    $lastSampleIndex = count($this->samples) - 1;
    $this->tree = $this->getSplitLeaf(range(0, $lastSampleIndex));

    // Importances belong to the previous tree; drop them so that they are
    // computed again from the new data when requested.
    $this->featureImportances = null;

    // Names supplied earlier are preserved; they are only generated, cut or
    // padded with numeric indices so that there is one per feature.
    $nameCount = count($this->columnNames);

    if ($nameCount === 0) {
        $this->columnNames = range(0, $this->featureCount - 1);

        return;
    }

    if ($nameCount > $this->featureCount) {
        $this->columnNames = array_slice($this->columnNames, 0, $this->featureCount);

        return;
    }

    if ($nameCount < $this->featureCount) {
        $missingNames = range($nameCount, $this->featureCount - 1);
        $this->columnNames = array_merge($this->columnNames, $missingNames);
    }
}
/**
 * Determines the type of every column of the given samples.
 *
 * The number of columns is taken from the first sample ($samples[0]).
 *
 * @param array $samples list of samples, each a list of column values
 *
 * @return array one self::NOMINAL or self::CONTINUOUS marker per column;
 *               empty when no samples are given
 */
public static function getColumnTypes(array $samples): array
{
    // Without samples there is no first row to read the column count from;
    // reading $samples[0] would hand null to count().
    if ($samples === []) {
        return [];
    }

    $types = [];
    $featureCount = count($samples[0]);
    for ($i = 0; $i < $featureCount; ++$i) {
        $values = array_column($samples, $i);
        $isCategorical = self::isCategoricalColumn($values);
        $types[] = $isCategorical ? self::NOMINAL : self::CONTINUOUS;
    }

    return $types;
}
/**
 * Computes the weighted Gini index of splitting a column into the values
 * identical (===) to $baseValue and all remaining values.
 *
 * @param mixed $baseValue value the column entries are compared against
 * @param array $colValues values of one column
 * @param array $targets   labels, looked up with the keys of $colValues
 */
public function getGiniIndex($baseValue, array $colValues, array $targets): float
{
    // Per label: [count on the matching side, count on the other side].
    $countMatrix = array_fill_keys($this->labels, [0, 0]);

    foreach ($colValues as $index => $value) {
        $side = $value === $baseValue ? 0 : 1;
        ++$countMatrix[$targets[$index]][$side];
    }

    $giniParts = [];
    foreach ([0, 1] as $side) {
        $sideTotal = array_sum(array_column($countMatrix, $side));

        // Sum of squared label proportions; stays 0 for an empty side.
        $squaredShares = 0;
        if ($sideTotal > 0) {
            foreach ($this->labels as $label) {
                $squaredShares += ($countMatrix[$label][$side] / (float) $sideTotal) ** 2;
            }
        }

        // Impurity of the side, weighted by the number of rows on it.
        $giniParts[$side] = (1 - $squaredShares) * $sideTotal;
    }

    return array_sum($giniParts) / count($colValues);
}
/**
 * Sets the maximum number of columns considered when deciding a split
 * at an internal node of the tree.
 *
 * With 0 (the default) all features are used. Any other value is the size of
 * the random column subset drawn by getSelectedFeatures() for each split; it
 * is capped at the feature count there. Features fixed through
 * setSelectedFeatures() take precedence over this setting.
 *
 * @return $this
 *
 * @throws InvalidArgumentException when $numFeatures is negative
 */
public function setNumFeatures(int $numFeatures)
{
if ($numFeatures < 0) {
throw new InvalidArgumentException('Selected column count should be greater or equal to zero');
}
$this->numUsableFeatures = $numFeatures;
return $this;
}
/**
 * Sets the names used to represent the columns. Useful when the HTML output
 * or the column importances are to be inspected.
 *
 * The length of $names is only validated once the feature count is known
 * (non-zero); names given before that are reconciled with the feature count
 * by train().
 *
 * @return $this
 *
 * @throws InvalidArgumentException when the feature count is known and
 *                                  differs from the number of names
 */
public function setColumnNames(array $names)
{
if ($this->featureCount !== 0 && count($names) !== $this->featureCount) {
throw new InvalidArgumentException(sprintf('Length of the given array should be equal to feature count %s', $this->featureCount));
}
$this->columnNames = $names;
return $this;
}
/**
 * Returns the HTML rendering of the root node (and thereby of the tree),
 * labelled with the current column names.
 *
 * NOTE(review): $this->tree is only assigned in train(); calling this on an
 * untrained instance presumably fails on a null tree - confirm.
 */
public function getHtml(): string
{
return $this->tree->getHTML($this->columnNames);
}
/**
 * Returns an importance value for each column, keyed by column name.
 *
 * A column's importance is the sum of the impurity decreases of all nodes
 * that split on it. When the total is positive, the values are normalized
 * to sum to 1 and sorted in descending order. The result is cached until
 * the tree is trained again.
 */
public function getFeatureImportances(): array
{
    if ($this->featureImportances !== null) {
        return $this->featureImportances;
    }

    $sampleCount = count($this->samples);
    $this->featureImportances = [];
    foreach ($this->columnNames as $column => $columnName) {
        $importance = 0;
        foreach ($this->getSplitNodesByColumn($column, $this->tree) as $node) {
            $importance += $node->getNodeImpurityDecrease($sampleCount);
        }

        $this->featureImportances[$columnName] = $importance;
    }

    // Normalize & sort, unless no column contributed anything
    $total = array_sum($this->featureImportances);
    if ($total > 0) {
        foreach ($this->featureImportances as $columnName => $importance) {
            $this->featureImportances[$columnName] = $importance / $total;
        }

        arsort($this->featureImportances);
    }

    return $this->featureImportances;
}
/**
 * Builds the node for the given record numbers and, unless the node is
 * terminal, recursively builds its left and right children.
 *
 * A node becomes terminal when all its records are equal, when the maximum
 * depth is reached or when only one class is left; its class value is then
 * the most frequent target among its records.
 */
protected function getSplitLeaf(array $records, int $depth = 0): DecisionTreeLeaf
{
    $split = $this->getBestSplit($records);
    $split->level = $depth;
    $this->actualDepth = max($this->actualDepth, $depth);

    // A single pass collects everything needed below: whether all records
    // are the same, which side of the split each record falls on, and how
    // often each target occurs
    $leftRecords = [];
    $rightRecords = [];
    $targetCounts = [];
    $previous = null;
    $identical = true;
    foreach ($records as $recordNo) {
        $record = $this->samples[$recordNo];
        if ($previous !== null && $previous != $record) {
            $identical = false;
        }

        $previous = $record;

        // The split criterion decides the side for the next level
        if ($split->evaluate($record)) {
            $leftRecords[] = $recordNo;
        } else {
            $rightRecords[] = $recordNo;
        }

        $target = $this->targets[$recordNo];
        $targetCounts[$target] = ($targetCounts[$target] ?? 0) + 1;
    }

    $isLeaf = $identical || $depth >= $this->maxDepth || count($targetCounts) === 1;
    if ($isLeaf) {
        $split->isTerminal = true;
        // Majority vote among the remaining targets
        arsort($targetCounts);
        $split->classValue = (string) key($targetCounts);

        return $split;
    }

    if ($leftRecords !== []) {
        $split->leftLeaf = $this->getSplitLeaf($leftRecords, $depth + 1);
    }

    if ($rightRecords !== []) {
        $split->rightLeaf = $this->getSplitLeaf($rightRecords, $depth + 1);
    }

    return $split;
}
/**
 * Finds the split with the lowest Gini index for the given record numbers.
 *
 * For each candidate column, the most frequent (preprocessed) value is the
 * split value; the first column with the lowest Gini index wins.
 */
protected function getBestSplit(array $records): DecisionTreeLeaf
{
    $recordKeys = array_flip($records);
    $targets = array_intersect_key($this->targets, $recordKeys);
    $samples = (array) array_combine(
        $records,
        $this->preprocess(array_intersect_key($this->samples, $recordKeys))
    );

    $bestSplit = null;
    foreach ($this->getSelectedFeatures() as $i) {
        $colValues = [];
        foreach ($samples as $index => $row) {
            $colValues[$index] = $row[$i];
        }

        // Most frequent value of the column is the split candidate
        $counts = array_count_values($colValues);
        arsort($counts);
        $baseValue = key($counts);
        if ($baseValue === null) {
            continue;
        }

        $gini = $this->getGiniIndex($baseValue, $colValues, $targets);
        if ($bestSplit !== null && $bestSplit->giniIndex <= $gini) {
            // Not an improvement over the best split found so far
            continue;
        }

        $isContinuous = $this->columnTypes[$i] === self::CONTINUOUS;

        $split = new DecisionTreeLeaf();
        $split->value = $baseValue;
        $split->giniIndex = $gini;
        $split->columnIndex = $i;
        $split->isContinuous = $isContinuous;
        $split->records = $records;

        // For a numeric column, the operator and the original numeric
        // value are stored in the leaf as well, for later evaluation
        if ($isContinuous) {
            $matches = [];
            preg_match("/^([<>=]{1,2})\s*(.*)/", (string) $split->value, $matches);
            $split->operator = $matches[1];
            $split->numericValue = (float) $matches[2];
        }

        $bestSplit = $split;
    }

    return $bestSplit;
}
/**
 * Returns the column indexes the tree may use for the next split decision.
 *
 * - Features fixed with setSelectedFeatures() are returned as they are.
 * - Otherwise, if a number was given with setNumFeatures(), a random
 *   selection of at most that many features is returned in ascending order.
 * - Otherwise all features are returned.
 */
protected function getSelectedFeatures(): array
{
    if ($this->selectedFeatures !== []) {
        return $this->selectedFeatures;
    }

    $allFeatures = range(0, $this->featureCount - 1);
    if ($this->numUsableFeatures === 0) {
        return $allFeatures;
    }

    // Never ask for more features than there are
    $numFeatures = min($this->numUsableFeatures, $this->featureCount);

    shuffle($allFeatures);
    $randomFeatures = array_slice($allFeatures, 0, $numFeatures);
    sort($randomFeatures);

    return $randomFeatures;
}
/**
 * Converts the values of continuous columns into the discrete strings
 * "<= median" / "> median", using the column median as the threshold,
 * and returns the samples as a re-indexed list of rows.
 *
 * @param array $samples rows to preprocess; keys are not preserved
 *
 * @return array list of rows, each a list of $this->featureCount values
 */
protected function preprocess(array $samples): array
{
    $columns = [];
    for ($i = 0; $i < $this->featureCount; ++$i) {
        $values = array_column($samples, $i);
        if ($this->columnTypes[$i] == self::CONTINUOUS) {
            $median = Mean::median($values);
            // Written back by key rather than by reference, so no dangling
            // reference survives the loop
            foreach ($values as $key => $value) {
                $values[$key] = $value <= $median ? "<= {$median}" : "> {$median}";
            }
        }

        $columns[] = $values;
    }

    if ($columns === []) {
        // No columns: array_map() cannot be called without an array
        return [];
    }

    if (count($columns) === 1) {
        // array_map(null, $array) returns $array itself for a single array,
        // so the one-column case has to be wrapped into rows explicitly
        return array_map(static function ($value): array {
            return [$value];
        }, $columns[0]);
    }

    // array_map() with a null callback zips the columns, i.e. it
    // transposes the 2D array back into rows
    return array_map(null, ...$columns);
}
/**
 * Guesses whether a column holds a discrete set of values.
 *
 * - Any float value makes the column continuous.
 * - Otherwise any non-numeric value makes it categorical.
 * - Otherwise (all values numeric, none float) the column is categorical
 *   only if its number of distinct values is at most 20% of all values.
 */
protected static function isCategoricalColumn(array $columnValues): bool
{
    $hasNonNumeric = false;
    foreach ($columnValues as $value) {
        if (is_float($value)) {
            return false;
        }

        if (!is_numeric($value)) {
            $hasNonNumeric = true;
        }
    }

    if ($hasNonNumeric) {
        return true;
    }

    $distinctCount = count(array_count_values($columnValues));

    return $distinctCount <= count($columnValues) / 5;
}
/**
 * Sets predefined features (column indexes) to consider while deciding
 * which column to use for a split. While this list is non-empty,
 * getSelectedFeatures() returns it unchanged and the setNumFeatures()
 * setting is not used.
 */
protected function setSelectedFeatures(array $selectedFeatures): void
{
$this->selectedFeatures = $selectedFeatures;
}
/**
 * Collects the internal (non-terminal) nodes of the subtree rooted at $node
 * that use the given column as their split criterion. The node itself comes
 * first, followed by the matches of its left and then its right subtree.
 */
protected function getSplitNodesByColumn(int $column, DecisionTreeLeaf $node): array
{
    if ($node->isTerminal) {
        return [];
    }

    $matches = $node->columnIndex === $column ? [$node] : [];
    foreach ([$node->leftLeaf, $node->rightLeaf] as $child) {
        if ($child !== null) {
            $matches = array_merge($matches, $this->getSplitNodesByColumn($column, $child));
        }
    }

    return $matches;
}
/**
 * Walks the tree from the root, going left when the node's criterion holds
 * for the sample and right otherwise, until a terminal node is reached.
 * If the walk runs into a missing child, the first known label is returned.
 *
 * @return mixed
 */
protected function predictSample(array $sample)
{
    $node = $this->tree;
    do {
        if ($node->isTerminal) {
            return $node->classValue;
        }

        $node = $node->evaluate($sample) ? $node->leftLeaf : $node->rightLeaf;
    } while ($node);

    return $this->labels[0];
}
}

View File

@ -0,0 +1,165 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification\DecisionTree;
use Phpml\Math\Comparison;
/**
 * A node of a decision tree: either an internal node holding a split
 * criterion on one column, or a terminal node holding a class value.
 */
class DecisionTreeLeaf
{
    /**
     * Value the column is compared against
     *
     * @var string|int
     */
    public $value;

    /**
     * Numeric operand of the comparison; used when the node is continuous
     *
     * @var float
     */
    public $numericValue;

    /**
     * Comparison operator; used when the node is continuous
     *
     * @var string
     */
    public $operator;

    /**
     * Index of the column the node splits on
     *
     * @var int
     */
    public $columnIndex;

    /**
     * Child taken when evaluate() returns true
     *
     * @var DecisionTreeLeaf|null
     */
    public $leftLeaf;

    /**
     * Child taken when evaluate() returns false
     *
     * @var DecisionTreeLeaf|null
     */
    public $rightLeaf;

    /**
     * @var array
     */
    public $records = [];

    /**
     * Class value represented by the leaf, this value is non-empty
     * only for terminal leaves
     *
     * @var string
     */
    public $classValue = '';

    /**
     * @var bool
     */
    public $isTerminal = false;

    /**
     * @var bool
     */
    public $isContinuous = false;

    /**
     * @var float
     */
    public $giniIndex = 0;

    /**
     * @var int
     */
    public $level = 0;

    /**
     * HTML representation of the tree without column names
     */
    public function __toString(): string
    {
        return $this->getHTML();
    }

    /**
     * Tells whether the record satisfies the node's split criterion: the
     * numeric comparison for continuous nodes, a loose equality with the
     * node's value otherwise.
     */
    public function evaluate(array $record): bool
    {
        $recordField = $record[$this->columnIndex];

        if ($this->isContinuous) {
            return Comparison::compare((string) $recordField, $this->numericValue, $this->operator);
        }

        return $recordField == $this->value;
    }

    /**
     * Returns Mean Decrease Impurity (MDI) in the node.
     * For terminal nodes, this value is equal to 0
     */
    public function getNodeImpurityDecrease(int $parentRecordCount): float
    {
        if ($this->isTerminal) {
            return 0.0;
        }

        $nodeSampleCount = (float) count($this->records);
        $iT = $this->giniIndex;

        // Subtract each child's impurity, weighted by its share of records
        if ($this->leftLeaf !== null) {
            $pL = count($this->leftLeaf->records) / $nodeSampleCount;
            $iT -= $pL * $this->leftLeaf->giniIndex;
        }

        if ($this->rightLeaf !== null) {
            $pR = count($this->rightLeaf->records) / $nodeSampleCount;
            $iT -= $pR * $this->rightLeaf->giniIndex;
        }

        return $iT * $nodeSampleCount / $parentRecordCount;
    }

    /**
     * Returns HTML representation of the node including children nodes
     *
     * @param array|null $columnNames names indexed by column index; when
     *                                null, columns are shown as "col_<index>"
     */
    public function getHTML(?array $columnNames = null): string
    {
        if ($this->isTerminal) {
            // The property has to be interpolated, not $this itself:
            // "${this}" would cast the node to string, which calls
            // __toString() -> getHTML() and never returns
            $value = "<b>{$this->classValue}</b>";
        } else {
            $value = $this->value;
            if ($columnNames !== null) {
                $col = $columnNames[$this->columnIndex];
            } else {
                $col = "col_{$this->columnIndex}";
            }

            // Values without a leading comparison operator are equality tests
            if ((bool) preg_match('/^[<>=]{1,2}/', (string) $value) === false) {
                $value = "={$value}";
            }

            $value = "<b>{$col} {$value}</b><br>Gini: ".number_format($this->giniIndex, 2);
        }

        $str = "<table ><tr><td colspan=3 align=center style='border:1px solid;'>{$value}</td></tr>";

        if ($this->leftLeaf !== null || $this->rightLeaf !== null) {
            $str .= '<tr>';
            if ($this->leftLeaf !== null) {
                $str .= '<td valign=top><b>| Yes</b><br>'.$this->leftLeaf->getHTML($columnNames).'</td>';
            } else {
                $str .= '<td></td>';
            }

            $str .= '<td>&nbsp;</td>';
            if ($this->rightLeaf !== null) {
                $str .= '<td valign=top align=right><b>No |</b><br>'.$this->rightLeaf->getHTML($columnNames).'</td>';
            } else {
                $str .= '<td></td>';
            }

            $str .= '</tr>';
        }

        $str .= '</table>';

        return $str;
    }
}

View File

@ -0,0 +1,252 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification\Ensemble;
use Phpml\Classification\Classifier;
use Phpml\Classification\Linear\DecisionStump;
use Phpml\Classification\WeightedClassifier;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Helper\Predictable;
use Phpml\Helper\Trainable;
use Phpml\Math\Statistic\Mean;
use Phpml\Math\Statistic\StandardDeviation;
use ReflectionClass;
/**
 * ADAptive BOOSTing (AdaBoost): a binary ensemble classifier that combines
 * the weighted votes of repeatedly trained 'weak' base classifiers.
 */
class AdaBoost implements Classifier
{
    use Predictable;
    use Trainable;

    /**
     * Actual labels given in the targets array, keyed by the internal
     * class values 1 and -1
     *
     * @var array
     */
    protected $labels = [];

    /**
     * @var int
     */
    protected $sampleCount;

    /**
     * @var int
     */
    protected $featureCount;

    /**
     * Number of maximum iterations to be done
     *
     * @var int
     */
    protected $maxIterations;

    /**
     * Sample weights
     *
     * @var array
     */
    protected $weights = [];

    /**
     * List of selected 'weak' classifiers
     *
     * @var array
     */
    protected $classifiers = [];

    /**
     * Base classifier weights
     *
     * @var array
     */
    protected $alpha = [];

    /**
     * @var string
     */
    protected $baseClassifier = DecisionStump::class;

    /**
     * @var array
     */
    protected $classifierOptions = [];

    /**
     * ADAptive BOOSTing (AdaBoost) is an ensemble algorithm to
     * improve classification performance of 'weak' classifiers such as
     * DecisionStump (default base classifier of AdaBoost).
     */
    public function __construct(int $maxIterations = 50)
    {
        $this->maxIterations = $maxIterations;
    }

    /**
     * Sets the base classifier that will be used for boosting (default = DecisionStump)
     *
     * @param array $classifierOptions constructor arguments of the base classifier
     */
    public function setBaseClassifier(string $baseClassifier = DecisionStump::class, array $classifierOptions = []): void
    {
        $this->baseClassifier = $baseClassifier;
        $this->classifierOptions = $classifierOptions;
    }

    /**
     * Trains one base classifier per iteration, re-weighting the samples
     * after each of them.
     *
     * @throws InvalidArgumentException when $targets does not contain exactly two classes
     */
    public function train(array $samples, array $targets): void
    {
        // Initialize usual variables
        $this->labels = array_keys(array_count_values($targets));
        if (count($this->labels) !== 2) {
            throw new InvalidArgumentException('AdaBoost is a binary classifier and can classify between two classes only');
        }

        // Set all target values to either -1 or 1
        $this->labels = [
            1 => $this->labels[0],
            -1 => $this->labels[1],
        ];
        foreach ($targets as $target) {
            $this->targets[] = $target == $this->labels[1] ? 1 : -1;
        }

        $this->samples = array_merge($this->samples, $samples);
        $this->featureCount = count($samples[0]);
        $this->sampleCount = count($this->samples);

        // Initialize AdaBoost parameters
        $this->weights = array_fill(0, $this->sampleCount, 1.0 / $this->sampleCount);
        $this->classifiers = [];
        $this->alpha = [];

        // Execute the algorithm for a maximum number of iterations
        $currIter = 0;
        while ($this->maxIterations > $currIter++) {
            // Determine the best 'weak' classifier based on current weights
            $classifier = $this->getBestClassifier();
            $errorRate = $this->evaluateClassifier($classifier);

            // Update alpha & weight values at each iteration
            $alpha = $this->calculateAlpha($errorRate);
            $this->updateWeights($classifier, $alpha);

            $this->classifiers[] = $classifier;
            $this->alpha[] = $alpha;
        }
    }

    /**
     * Returns the label selected by the sign of the alpha-weighted sum of
     * the base classifiers' predictions.
     *
     * @return mixed
     */
    public function predictSample(array $sample)
    {
        $sum = 0;
        foreach ($this->alpha as $index => $alpha) {
            $h = $this->classifiers[$index]->predict($sample);
            $sum += $h * $alpha;
        }

        return $this->labels[$sum > 0 ? 1 : -1];
    }

    /**
     * Instantiates the base classifier and trains it with the current
     * sample weights: directly when it is a WeightedClassifier, through a
     * weight-driven resampling of the dataset otherwise.
     */
    protected function getBestClassifier(): Classifier
    {
        $ref = new ReflectionClass($this->baseClassifier);
        /** @var Classifier $classifier */
        $classifier = count($this->classifierOptions) === 0 ? $ref->newInstance() : $ref->newInstanceArgs($this->classifierOptions);

        if ($classifier instanceof WeightedClassifier) {
            $classifier->setSampleWeights($this->weights);
            $classifier->train($this->samples, $this->targets);
        } else {
            [$samples, $targets] = $this->resample();
            $classifier->train($samples, $targets);
        }

        return $classifier;
    }

    /**
     * Resamples the dataset in accordance with the weights and
     * returns the new dataset
     *
     * Each sample gets a number of chances that grows with the z-score of
     * its weight; every chance is taken with a probability of one half.
     *
     * @return array [samples, targets]
     */
    protected function resample(): array
    {
        $weights = $this->weights;
        $std = StandardDeviation::population($weights);
        $mean = Mean::arithmetic($weights);

        // Uniform weights (always the case in the first boosting round) have
        // a zero standard deviation; all z-scores are then 0 instead of the
        // result of a division by zero
        $zScore = static function (float $weight) use ($mean, $std): int {
            return $std > 0 ? (int) round(($weight - $mean) / $std) : 0;
        };
        $minZ = $zScore(min($weights));

        $samples = [];
        $targets = [];
        foreach ($weights as $index => $weight) {
            // Shifted so that the lowest weight still gets one chance
            $z = $zScore($weight) - $minZ + 1;
            for ($i = 0; $i < $z; ++$i) {
                if (random_int(0, 1) == 0) {
                    continue;
                }

                $samples[] = $this->samples[$index];
                $targets[] = $this->targets[$index];
            }
        }

        return [$samples, $targets];
    }

    /**
     * Evaluates the classifier and returns the classification error rate,
     * i.e. the weight share of the misclassified training samples
     */
    protected function evaluateClassifier(Classifier $classifier): float
    {
        $total = (float) array_sum($this->weights);
        $wrong = 0;
        foreach ($this->samples as $index => $sample) {
            $predicted = $classifier->predict($sample);
            if ($predicted != $this->targets[$index]) {
                $wrong += $this->weights[$index];
            }
        }

        return $wrong / $total;
    }

    /**
     * Calculates alpha of a classifier
     *
     * Error rates of exactly 0 or 1 (and above) are moved slightly inside
     * the open interval so that the logarithm stays finite.
     */
    protected function calculateAlpha(float $errorRate): float
    {
        if ($errorRate == 0) {
            $errorRate = 1e-10;
        } elseif ($errorRate >= 1) {
            // Without this, log(0) would produce -INF and poison the weights
            $errorRate = 1 - 1e-10;
        }

        return 0.5 * log((1 - $errorRate) / $errorRate);
    }

    /**
     * Updates the sample weights: correctly classified samples lose weight,
     * misclassified ones gain weight (for a positive alpha)
     */
    protected function updateWeights(Classifier $classifier, float $alpha): void
    {
        $sumOfWeights = array_sum($this->weights);
        $weightsT1 = [];
        foreach ($this->weights as $index => $weight) {
            $desired = $this->targets[$index];
            $output = $classifier->predict($this->samples[$index]);

            $weight *= exp(-$alpha * $desired * $output) / $sumOfWeights;

            $weightsT1[] = $weight;
        }

        $this->weights = $weightsT1;
    }
}

View File

@ -0,0 +1,170 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification\Ensemble;
use Phpml\Classification\Classifier;
use Phpml\Classification\DecisionTree;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Helper\Predictable;
use Phpml\Helper\Trainable;
use ReflectionClass;
/**
 * Bootstrap aggregating: trains a number of base classifiers on random
 * subsets of the data and predicts by majority vote.
 */
class Bagging implements Classifier
{
    use Trainable;
    use Predictable;

    /**
     * @var int
     */
    protected $numSamples;

    /**
     * @var int
     */
    protected $featureCount = 0;

    /**
     * @var int
     */
    protected $numClassifier;

    /**
     * @var string
     */
    protected $classifier = DecisionTree::class;

    /**
     * Constructor arguments of the base classifier, in constructor order;
     * the keys are not used
     *
     * @var array
     */
    protected $classifierOptions = ['depth' => 20];

    /**
     * @var array
     */
    protected $classifiers = [];

    /**
     * @var float
     */
    protected $subsetRatio = 0.7;

    /**
     * Creates an ensemble classifier with given number of base classifiers
     * Default number of base classifiers is 50.
     * The more number of base classifiers, the better performance but at the cost of processing time
     */
    public function __construct(int $numClassifier = 50)
    {
        $this->numClassifier = $numClassifier;
    }

    /**
     * This method determines the ratio of samples used to create the 'bootstrap' subset,
     * e.g., random samples drawn from the original dataset with replacement (allow repeats),
     * to train each base classifier.
     *
     * @return $this
     *
     * @throws InvalidArgumentException when $ratio is outside [0.1, 1.0]
     */
    public function setSubsetRatio(float $ratio)
    {
        if ($ratio < 0.1 || $ratio > 1.0) {
            throw new InvalidArgumentException('Subset ratio should be between 0.1 and 1.0');
        }

        $this->subsetRatio = $ratio;

        return $this;
    }

    /**
     * This method is used to set the base classifier. Default value is
     * DecisionTree::class, but any class that implements the <i>Classifier</i>
     * can be used. <br>
     * While giving the parameters of the classifier, the values should be
     * given in the order they are in the constructor of the classifier and parameter
     * names are neglected.
     *
     * @return $this
     */
    public function setClassifer(string $classifier, array $classifierOptions = [])
    {
        $this->classifier = $classifier;
        $this->classifierOptions = $classifierOptions;

        return $this;
    }

    /**
     * Adds the given data to the stored training set, then creates the base
     * classifiers and trains each of them on its own bootstrap subset.
     */
    public function train(array $samples, array $targets): void
    {
        $this->samples = array_merge($this->samples, $samples);
        $this->targets = array_merge($this->targets, $targets);
        $this->featureCount = count($samples[0]);
        $this->numSamples = count($this->samples);

        // Init classifiers and train them with bootstrap samples
        $this->classifiers = $this->initClassifiers();
        $index = 0;
        foreach ($this->classifiers as $classifier) {
            [$samples, $targets] = $this->getRandomSubset($index);
            $classifier->train($samples, $targets);
            ++$index;
        }
    }

    /**
     * Draws the bootstrap subset (with replacement) for the classifier with
     * the given index. The generator is seeded with the index, so the same
     * index always yields the same subset.
     *
     * @return array [samples, targets]
     */
    protected function getRandomSubset(int $index): array
    {
        $samples = [];
        $targets = [];
        // The seed only affects the Mt19937 generator behind mt_rand();
        // random_int() would ignore it and make the seeding pointless
        mt_srand($index);
        $bootstrapSize = $this->subsetRatio * $this->numSamples;
        for ($i = 0; $i < $bootstrapSize; ++$i) {
            $rand = mt_rand(0, $this->numSamples - 1);
            $samples[] = $this->samples[$rand];
            $targets[] = $this->targets[$rand];
        }

        return [$samples, $targets];
    }

    /**
     * Creates $numClassifier instances of the base classifier and passes
     * each of them through initSingleClassifier().
     */
    protected function initClassifiers(): array
    {
        $ref = new ReflectionClass($this->classifier);
        // Options are positional by contract. Their keys are dropped because
        // PHP 8 would otherwise treat string keys as named arguments and
        // fail on names such as the default 'depth'
        $arguments = array_values($this->classifierOptions);

        $classifiers = [];
        for ($i = 0; $i < $this->numClassifier; ++$i) {
            /** @var Classifier $obj */
            $obj = $arguments === [] ? $ref->newInstance() : $ref->newInstanceArgs($arguments);

            $classifiers[] = $this->initSingleClassifier($obj);
        }

        return $classifiers;
    }

    /**
     * Hook for subclasses to configure a freshly created base classifier;
     * returns it unchanged here.
     */
    protected function initSingleClassifier(Classifier $classifier): Classifier
    {
        return $classifier;
    }

    /**
     * Returns the prediction made by most of the base classifiers.
     *
     * @return mixed
     */
    protected function predictSample(array $sample)
    {
        $predictions = [];
        foreach ($this->classifiers as $classifier) {
            /** @var Classifier $classifier */
            $predictions[] = $classifier->predict($sample);
        }

        $counts = array_count_values($predictions);
        arsort($counts);
        reset($counts);

        return key($counts);
    }
}

View File

@ -0,0 +1,157 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification\Ensemble;
use Phpml\Classification\Classifier;
use Phpml\Classification\DecisionTree;
use Phpml\Exception\InvalidArgumentException;
/**
 * Bagging of DecisionTree classifiers in which every tree considers only a
 * random subset of the features at each split.
 */
class RandomForest extends Bagging
{
    /**
     * Either 'sqrt', 'log' or a ratio of the feature count
     *
     * @var float|string
     */
    protected $featureSubsetRatio = 'log';

    /**
     * @var array|null
     */
    protected $columnNames;

    /**
     * Initializes RandomForest with the given number of trees. More trees
     * may increase the prediction performance while it will also substantially
     * increase the processing time and the required memory
     */
    public function __construct(int $numClassifier = 50)
    {
        parent::__construct($numClassifier);

        $this->setSubsetRatio(1.0);
    }

    /**
     * This method is used to determine how many of the original columns (features)
     * will be used to construct subsets to train base classifiers.<br>
     *
     * Allowed values: 'sqrt', 'log' or any float number between 0.1 and 1.0 <br>
     *
     * Default value for the ratio is 'log' which results in log(numFeatures, 2) + 1
     * features to be taken into consideration while selecting subspace of features
     *
     * @param mixed $ratio
     *
     * @throws InvalidArgumentException when $ratio is not one of the allowed values
     */
    public function setFeatureSubsetRatio($ratio): self
    {
        if (!is_string($ratio) && !is_float($ratio)) {
            throw new InvalidArgumentException('Feature subset ratio must be a string or a float');
        }

        if (is_float($ratio) && ($ratio < 0.1 || $ratio > 1.0)) {
            throw new InvalidArgumentException('When a float is given, feature subset ratio should be between 0.1 and 1.0');
        }

        if (is_string($ratio) && $ratio !== 'sqrt' && $ratio !== 'log') {
            throw new InvalidArgumentException("When a string is given, feature subset ratio can only be 'sqrt' or 'log'");
        }

        $this->featureSubsetRatio = $ratio;

        return $this;
    }

    /**
     * RandomForest algorithm is usable *only* with DecisionTree
     *
     * @return $this
     *
     * @throws InvalidArgumentException when another classifier class is given
     */
    public function setClassifer(string $classifier, array $classifierOptions = [])
    {
        if ($classifier !== DecisionTree::class) {
            throw new InvalidArgumentException('RandomForest can only use DecisionTree as base classifier');
        }

        parent::setClassifer($classifier, $classifierOptions);

        return $this;
    }

    /**
     * This will return an array including an importance value for
     * each column in the given dataset. Importance values for a column
     * is the average importance of that column in all trees in the forest
     *
     * The values are normalized to sum to 1 unless their total is zero
     * (e.g. no tree has been trained yet); they are sorted in descending order.
     */
    public function getFeatureImportances(): array
    {
        // Traverse each tree and sum importance of the columns
        $sum = [];
        foreach ($this->classifiers as $tree) {
            /** @var DecisionTree $tree */
            $importances = $tree->getFeatureImportances();

            foreach ($importances as $column => $importance) {
                if (array_key_exists($column, $sum)) {
                    $sum[$column] += $importance;
                } else {
                    $sum[$column] = $importance;
                }
            }
        }

        // Normalize & sort the importance values; a zero total would make
        // the normalization divide by zero
        $total = array_sum($sum);
        if ($total > 0) {
            foreach ($sum as $column => $importance) {
                $sum[$column] = $importance / $total;
            }
        }

        arsort($sum);

        return $sum;
    }

    /**
     * A string array to represent the columns is given. They are useful
     * when trying to print some information about the trees such as feature importances
     *
     * @return $this
     */
    public function setColumnNames(array $names)
    {
        $this->columnNames = $names;

        return $this;
    }

    /**
     * Configures a tree of the forest with the column names and the number
     * of features it may consider per split.
     *
     * @return DecisionTree
     *
     * @throws InvalidArgumentException when $classifier is not a DecisionTree
     */
    protected function initSingleClassifier(Classifier $classifier): Classifier
    {
        if (!$classifier instanceof DecisionTree) {
            throw new InvalidArgumentException(
                sprintf('Classifier %s expected, got %s', DecisionTree::class, get_class($classifier))
            );
        }

        if (is_float($this->featureSubsetRatio)) {
            $featureCount = (int) ($this->featureSubsetRatio * $this->featureCount);
        } elseif ($this->featureSubsetRatio === 'sqrt') {
            $featureCount = (int) ($this->featureCount ** .5) + 1;
        } else {
            $featureCount = (int) log($this->featureCount, 2) + 1;
        }

        if ($featureCount >= $this->featureCount) {
            $featureCount = $this->featureCount;
        }

        // A small ratio can truncate to 0, which setNumFeatures() treats as
        // "use all features"; a subset has to hold at least one feature
        if ($featureCount < 1 && $this->featureCount > 0) {
            $featureCount = 1;
        }

        if ($this->columnNames === null) {
            $this->columnNames = range(0, $this->featureCount - 1);
        }

        return $classifier
            ->setColumnNames($this->columnNames)
            ->setNumFeatures($featureCount);
    }
}

View File

@ -4,14 +4,15 @@ declare (strict_types = 1);
namespace Phpml\Classification; namespace Phpml\Classification;
use Phpml\Classification\Traits\Predictable; use Phpml\Helper\Predictable;
use Phpml\Classification\Traits\Trainable; use Phpml\Helper\Trainable;
use Phpml\Math\Distance; use Phpml\Math\Distance;
use Phpml\Math\Distance\Euclidean; use Phpml\Math\Distance\Euclidean;
class KNearestNeighbors implements Classifier class KNearestNeighbors implements Classifier
{ {
use Trainable, Predictable; use Trainable;
use Predictable;
/** /**
* @var int * @var int
@ -24,34 +25,30 @@ class KNearestNeighbors implements Classifier
private $distanceMetric; private $distanceMetric;
/** /**
* @param int $k
* @param Distance|null $distanceMetric (if null then Euclidean distance as default) * @param Distance|null $distanceMetric (if null then Euclidean distance as default)
*/ */
public function __construct(int $k = 3, Distance $distanceMetric = null) public function __construct(int $k = 3, ?Distance $distanceMetric = null)
{ {
if (null === $distanceMetric) { if ($distanceMetric === null) {
$distanceMetric = new Euclidean(); $distanceMetric = new Euclidean();
} }
$this->k = $k; $this->k = $k;
$this->samples = []; $this->samples = [];
$this->labels = []; $this->targets = [];
$this->distanceMetric = $distanceMetric; $this->distanceMetric = $distanceMetric;
} }
/** /**
* @param array $sample
*
* @return mixed * @return mixed
*/ */
protected function predictSample(array $sample) protected function predictSample(array $sample)
{ {
$distances = $this->kNeighborsDistances($sample); $distances = $this->kNeighborsDistances($sample);
$predictions = (array) array_combine(array_values($this->targets), array_fill(0, count($this->targets), 0));
$predictions = array_combine(array_values($this->labels), array_fill(0, count($this->labels), 0)); foreach (array_keys($distances) as $index) {
++$predictions[$this->targets[$index]];
foreach ($distances as $index => $distance) {
++$predictions[$this->labels[$index]];
} }
arsort($predictions); arsort($predictions);
@ -61,10 +58,6 @@ class KNearestNeighbors implements Classifier
} }
/** /**
* @param array $sample
*
* @return array
*
* @throws \Phpml\Exception\InvalidArgumentException * @throws \Phpml\Exception\InvalidArgumentException
*/ */
private function kNeighborsDistances(array $sample): array private function kNeighborsDistances(array $sample): array

View File

@ -0,0 +1,75 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification\Linear;
use Phpml\Exception\InvalidArgumentException;
/**
 * Adaline (ADAptive LInear NEuron): a Perceptron whose weights are adapted
 * by gradient descent on a sum-of-squares cost.
 */
class Adaline extends Perceptron
{
    /**
     * Batch training is the default Adaline training algorithm
     */
    public const BATCH_TRAINING = 1;

    /**
     * Online training: Stochastic gradient descent learning
     */
    public const ONLINE_TRAINING = 2;

    /**
     * One of self::BATCH_TRAINING or self::ONLINE_TRAINING
     *
     * @var string|int
     */
    protected $trainingType;

    /**
     * Initialize an Adaline classifier with the given learning rate and
     * maximum number of iterations used while training the classifier.
     *
     * Learning rate, iteration limit and the input normalization flag are
     * handed to the Perceptron constructor as they are; the training type
     * selects between batch and online (stochastic) gradient descent.
     *
     * @throws InvalidArgumentException when $trainingType is neither
     *                                  BATCH_TRAINING nor ONLINE_TRAINING
     */
    public function __construct(
        float $learningRate = 0.001,
        int $maxIterations = 1000,
        bool $normalizeInputs = true,
        int $trainingType = self::BATCH_TRAINING
    ) {
        $isKnownType = $trainingType === self::BATCH_TRAINING || $trainingType === self::ONLINE_TRAINING;
        if (!$isKnownType) {
            throw new InvalidArgumentException('Adaline can only be trained with batch and online/stochastic gradient descent algorithm');
        }

        $this->trainingType = $trainingType;

        parent::__construct($learningRate, $maxIterations, $normalizeInputs);
    }

    /**
     * Adapts the weights with respect to given samples and targets
     * by use of gradient descent learning rule
     */
    protected function runTraining(array $samples, array $targets): void
    {
        // Cost of one sample: squared difference between output and target;
        // the callback returns [cost, gradient] for the given weights
        $costAndGradient = function ($weights, $sample, $target): array {
            $this->weights = $weights;

            $gradient = $this->output($sample) - $target;

            return [$gradient ** 2, $gradient];
        };

        parent::runGradientDescent(
            $samples,
            $targets,
            $costAndGradient,
            $this->trainingType == self::BATCH_TRAINING
        );
    }
}

View File

@ -0,0 +1,319 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification\Linear;
use Phpml\Classification\DecisionTree;
use Phpml\Classification\WeightedClassifier;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Helper\OneVsRest;
use Phpml\Helper\Predictable;
use Phpml\Math\Comparison;
class DecisionStump extends WeightedClassifier
{
use Predictable;
use OneVsRest;
public const AUTO_SELECT = -1;
/**
* @var int
*/
protected $givenColumnIndex;
/**
* @var array
*/
protected $binaryLabels = [];
/**
* Lowest error rate obtained while training/optimizing the model
*
* @var float
*/
protected $trainingErrorRate;
/**
* @var int
*/
protected $column;
/**
* @var mixed
*/
protected $value;
/**
* @var string
*/
protected $operator;
/**
* @var array
*/
protected $columnTypes = [];
/**
* @var int
*/
protected $featureCount;
/**
* @var float
*/
protected $numSplitCount = 100.0;
/**
* Distribution of samples in the leaves
*
* @var array
*/
protected $prob = [];
/**
 * A DecisionStump classifier is a one-level deep DecisionTree. It is generally
 * used with ensemble algorithms as in the weak classifier role. <br>
 *
 * If columnIndex is given, then the stump tries to produce a decision node
 * on this column, otherwise in cases given the value of -1, the stump itself
 * decides which column to take for the decision (Default DecisionTree behaviour)
 *
 * @param int $columnIndex column to split on, or self::AUTO_SELECT (-1);
 *                         an index beyond the last column is replaced by
 *                         AUTO_SELECT in trainBinary()
 */
public function __construct(int $columnIndex = self::AUTO_SELECT)
{
$this->givenColumnIndex = $columnIndex;
}
/**
 * Describes the stump as a rule of the form
 * "IF <column> <operator> <value> THEN <label> ELSE <label>".
 */
public function __toString(): string
{
    // The properties have to be interpolated, not $this itself: "${this}"
    // would cast the stump to string, which calls __toString() again and
    // never returns
    return "IF {$this->column} {$this->operator} {$this->value} ".
        'THEN '.$this->binaryLabels[0].' '.
        'ELSE '.$this->binaryLabels[1];
}
/**
 * While finding best split point for a numerical valued column,
 * DecisionStump looks for equally distanced values between minimum and maximum
 * values in the column. Given <i>$count</i> value determines how many split
 * points to be probed. The more split counts, the better performance but
 * worse processing time (Default value is 100.0)
 */
public function setNumericalSplitCount(float $count): void
{
$this->numSplitCount = $count;
}
/**
 * Trains the stump for a two-class problem: probes the configured column
 * (or every column when auto-selecting) and keeps the split with the lowest
 * training error rate.
 *
 * @param array $samples training samples
 * @param array $targets target of each sample
 * @param array $labels  the two labels; stored as the stump's binary labels
 *
 * @throws InvalidArgumentException when sample weights were given and their
 *                                  number differs from the number of samples
 */
protected function trainBinary(array $samples, array $targets, array $labels): void
{
    $this->binaryLabels = $labels;
    $this->featureCount = count($samples[0]);

    // A given column index has to be among the existing columns. Anything
    // outside 0..featureCount-1 (including negative values other than
    // AUTO_SELECT) falls back to automatic selection
    if ($this->givenColumnIndex < 0 || $this->givenColumnIndex > $this->featureCount - 1) {
        $this->givenColumnIndex = self::AUTO_SELECT;
    }

    // Check the size of the weights given.
    // If none given, then assign 1 as a weight to each sample
    if (count($this->weights) === 0) {
        $this->weights = array_fill(0, count($samples), 1);
    } else {
        $numWeights = count($this->weights);
        if ($numWeights !== count($samples)) {
            throw new InvalidArgumentException('Number of sample weights does not match with number of samples');
        }
    }

    // Determine type of each column as either "continuous" or "nominal"
    $this->columnTypes = DecisionTree::getColumnTypes($samples);

    // Try to find the best split in the columns of the dataset
    // by calculating error rate for each split point in each column
    $columns = range(0, $this->featureCount - 1);
    if ($this->givenColumnIndex !== self::AUTO_SELECT) {
        $columns = [$this->givenColumnIndex];
    }

    $bestSplit = [
        'value' => 0,
        'operator' => '',
        'prob' => [],
        'column' => 0,
        'trainingErrorRate' => 1.0,
    ];
    foreach ($columns as $col) {
        if ($this->columnTypes[$col] == DecisionTree::CONTINUOUS) {
            $split = $this->getBestNumericalSplit($samples, $targets, $col);
        } else {
            $split = $this->getBestNominalSplit($samples, $targets, $col);
        }

        if ($split['trainingErrorRate'] < $bestSplit['trainingErrorRate']) {
            $bestSplit = $split;
        }
    }

    // Assign determined best values to the stump; the keys of the split
    // array are the names of the properties to fill
    foreach ($bestSplit as $name => $value) {
        $this->{$name} = $value;
    }
}
/**
 * Determines the best split point for the given numerical column.
 *
 * For each of the operators '<=' and '>' the column mean is tried first,
 * followed by equally spaced thresholds between the column minimum and
 * maximum. The candidate with the lowest error rate is returned.
 *
 * @return array Split description with the keys value, operator, prob, column and trainingErrorRate
 */
protected function getBestNumericalSplit(array $samples, array $targets, int $col): array
{
    $values = array_column($samples, $col);

    // Trying all possible points may be accomplished in two general ways:
    // 1- Try all values in the $samples array ($values)
    // 2- Artificially split the range of values into several parts and try them
    // We choose the second one because it is faster in larger datasets
    $minValue = min($values);
    $maxValue = max($values);

    // A non-positive split count cannot produce a usable step; in that case
    // only the mean is probed instead of dividing by zero.
    $stepSize = $this->numSplitCount > 0
        ? ($maxValue - $minValue) / $this->numSplitCount
        : 0.0;

    // The mean does not depend on the operator, so it is computed once
    $meanValue = array_sum($values) / (float) count($values);

    $split = [];
    foreach (['<=', '>'] as $operator) {
        // Before trying all possible split points, let's first try
        // the average value for the cut point
        [$errorRate, $prob] = $this->calculateErrorRate($targets, $meanValue, $operator, $values);
        if (!isset($split['trainingErrorRate']) || $errorRate < $split['trainingErrorRate']) {
            $split = [
                'value' => $meanValue,
                'operator' => $operator,
                'prob' => $prob,
                'column' => $col,
                'trainingErrorRate' => $errorRate,
            ];
        }

        // With a zero step (constant column) the probing loop below would
        // never advance and thus never terminate, so it is skipped.
        if ($stepSize <= 0) {
            continue;
        }

        // Try other possible points one by one
        for ($step = $minValue; $step <= $maxValue; $step += $stepSize) {
            $threshold = (float) $step;
            [$errorRate, $prob] = $this->calculateErrorRate($targets, $threshold, $operator, $values);
            if ($errorRate < $split['trainingErrorRate']) {
                $split = [
                    'value' => $threshold,
                    'operator' => $operator,
                    'prob' => $prob,
                    'column' => $col,
                    'trainingErrorRate' => $errorRate,
                ];
            }
        }
    }

    return $split;
}
/**
 * Determines the best split for the given nominal column.
 *
 * Every distinct value of the column is tried with the operators '=' and
 * '!=' and the candidate with the lowest error rate is returned.
 *
 * @return array Split description with the keys value, operator, prob, column and trainingErrorRate
 */
protected function getBestNominalSplit(array $samples, array $targets, int $col): array
{
    $values = array_column($samples, $col);
    $distinctVals = array_keys(array_count_values($values));

    $split = [];
    foreach (['=', '!='] as $operator) {
        foreach ($distinctVals as $val) {
            [$errorRate, $prob] = $this->calculateErrorRate($targets, $val, $operator, $values);

            // Keep the candidate with the LOWEST error rate, as the numerical
            // variant and the final selection in trainBinary() do. The former
            // comparison was inverted and retained the worst candidate.
            if (!isset($split['trainingErrorRate']) || $errorRate < $split['trainingErrorRate']) {
                $split = [
                    'value' => $val,
                    'operator' => $operator,
                    'prob' => $prob,
                    'column' => $col,
                    'trainingErrorRate' => $errorRate,
                ];
            }
        }
    }

    return $split;
}
/**
 * Evaluates one candidate split: samples whose value satisfies
 * "<value> <operator> <threshold>" are predicted as the first binary label,
 * all others as the second one.
 *
 * @return array Two elements: the weighted share of wrong predictions, and for
 *               each binary label the share of samples in its leaf that really carry that label
 */
protected function calculateErrorRate(array $targets, float $threshold, string $operator, array $values): array
{
    $labelWhenTrue = $this->binaryLabels[0];
    $labelWhenFalse = $this->binaryLabels[1];

    $misclassifiedWeight = 0.0;
    $leafCounts = [];
    foreach ($values as $row => $value) {
        $predicted = Comparison::compare($value, $threshold, $operator) ? $labelWhenTrue : $labelWhenFalse;
        $actual = $targets[$row];

        if ((string) $predicted != (string) $actual) {
            $misclassifiedWeight += $this->weights[$row];
        }

        // Tally how many samples of each actual label ended up in each leaf
        $leafCounts[$predicted][$actual] = ($leafCounts[$predicted][$actual] ?? 0) + 1;
    }

    // Calculate probabilities: Proportion of labels in each leaf
    $distribution = array_combine($this->binaryLabels, [0.0, 0.0]);
    foreach ($leafCounts as $leaf => $countsByLabel) {
        $leafSize = (float) array_sum($countsByLabel);
        foreach ($countsByLabel as $label => $occurrences) {
            if ((string) $leaf == (string) $label) {
                $distribution[$leaf] = $occurrences / $leafSize;
            }
        }
    }

    $totalWeight = (float) array_sum($this->weights);

    return [$misclassifiedWeight / $totalWeight, $distribution];
}
/**
 * Returns the probability of the sample belonging to the given label.
 *
 * When the stump predicts $label for the sample, the value stored for that
 * label in the leaf distribution ($this->prob) is returned; for any other
 * label the result is 0.0.
 *
 * @param mixed $label
 */
protected function predictProbability(array $sample, $label): float
{
    $predictedLabel = $this->predictSampleBinary($sample);

    return (string) $predictedLabel == (string) $label ? $this->prob[$label] : 0.0;
}
/**
 * Applies the learned rule to one sample: the first binary label is returned
 * when the sample's value in the split column satisfies the stored
 * operator/value pair, otherwise the second one.
 *
 * @return mixed
 */
protected function predictSampleBinary(array $sample)
{
    $conditionHolds = Comparison::compare($sample[$this->column], $this->value, $this->operator);

    return $conditionHolds ? $this->binaryLabels[0] : $this->binaryLabels[1];
}
/**
* Intentionally empty: this class clears no state here.
* NOTE(review): presumably a hook required by the OneVsRest trait - confirm against the trait.
*/
protected function resetBinary(): void
{
}
}

View File

@ -0,0 +1,283 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification\Linear;
use Closure;
use Exception;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Helper\Optimizer\ConjugateGradient;
class LogisticRegression extends Adaline
{
/**
* Batch training: Gradient descent algorithm (default)
*/
public const BATCH_TRAINING = 1;
/**
* Online training: Stochastic gradient descent learning
*/
public const ONLINE_TRAINING = 2;
/**
* Conjugate Batch: Conjugate Gradient algorithm
*/
public const CONJUGATE_GRAD_TRAINING = 3;
/**
* Cost function to optimize: 'log' and 'sse' are supported <br>
* - 'log' : log likelihood <br>
* - 'sse' : sum of squared errors <br>
*
* @var string
*/
protected $costFunction = 'log';
/**
* Regularization term: only 'L2' is supported
*
* @var string
*/
protected $penalty = 'L2';
/**
* Lambda (λ) parameter of regularization term. If λ is set to 0, then
* regularization term is cancelled.
*
* @var float
*/
protected $lambda = 0.5;
/**
 * Initialize a Logistic Regression classifier with maximum number of iterations
 * and learning rule to be applied <br>
 *
 * Maximum number of iterations can be an integer value greater than 0 <br>
 * If normalizeInputs is set to true, then every input given to the algorithm will be standardized
 * by use of standard deviation and mean calculation <br>
 *
 * Cost function can be 'log' for log-likelihood and 'sse' for sum of squared errors <br>
 *
 * Penalty (Regularization term) can be 'L2' (matched case-insensitively) or
 * an empty string to cancel the penalty term
 *
 * @param int    $trainingType One of self::BATCH_TRAINING, self::ONLINE_TRAINING, self::CONJUGATE_GRAD_TRAINING
 * @param string $cost         'log' or 'sse'
 * @param string $penalty      'L2' or ''
 *
 * @throws InvalidArgumentException when the training type, cost function or penalty is not supported
 */
public function __construct(
    int $maxIterations = 500,
    bool $normalizeInputs = true,
    int $trainingType = self::CONJUGATE_GRAD_TRAINING,
    string $cost = 'log',
    string $penalty = 'L2'
) {
    $trainingTypes = range(self::BATCH_TRAINING, self::CONJUGATE_GRAD_TRAINING);
    if (!in_array($trainingType, $trainingTypes, true)) {
        throw new InvalidArgumentException(
            'Logistic regression can only be trained with '.
            'batch (gradient descent), online (stochastic gradient descent) '.
            'or conjugate batch (conjugate gradients) algorithms'
        );
    }

    if (!in_array($cost, ['log', 'sse'], true)) {
        throw new InvalidArgumentException(
            "Logistic regression cost function can be one of the following: \n".
            "'log' for log-likelihood and 'sse' for sum of squared errors"
        );
    }

    // Normalize once and store the normalized form: getCostFunction() tests
    // the stored value with === 'L2', so keeping a validated but lower-case
    // 'l2' would silently switch regularization off.
    $penalty = strtoupper($penalty);
    if ($penalty !== '' && $penalty !== 'L2') {
        throw new InvalidArgumentException('Logistic regression supports only \'L2\' regularization');
    }

    $this->learningRate = 0.001;

    parent::__construct($this->learningRate, $maxIterations, $normalizeInputs);

    $this->trainingType = $trainingType;
    $this->costFunction = $cost;
    $this->penalty = $penalty;
}
/**
* Sets the learning rate that is used if a gradient descent algorithm is
* selected for training. The value is stored as given; no range check is
* performed here.
*/
public function setLearningRate(float $learningRate): void
{
$this->learningRate = $learningRate;
}
/**
* Lambda (λ) parameter of the regularization term. If 0 is given,
* then the regularization term is cancelled. The value only takes effect
* while the penalty is 'L2' (see getCostFunction()).
*/
public function setLambda(float $lambda): void
{
$this->lambda = $lambda;
}
/**
 * Adapts the weights to the given samples and targets with the solver
 * selected through the training type.
 *
 * @throws \Exception when the stored training type is none of the supported ones
 */
protected function runTraining(array $samples, array $targets): void
{
    $costCallback = $this->getCostFunction();
    $type = $this->trainingType;

    // Loose comparisons on purpose: they mirror the matching rules of a switch statement
    if ($type == self::BATCH_TRAINING) {
        $this->runGradientDescent($samples, $targets, $costCallback, true);

        return;
    }

    if ($type == self::ONLINE_TRAINING) {
        $this->runGradientDescent($samples, $targets, $costCallback, false);

        return;
    }

    if ($type == self::CONJUGATE_GRAD_TRAINING) {
        $this->runConjugateGradient($samples, $targets, $costCallback);

        return;
    }

    // Not reached
    throw new Exception(sprintf('Logistic regression has invalid training type: %d.', $this->trainingType));
}
/**
 * Optimizes the weights of the LogReg model with the Conjugate Gradient
 * method. The optimizer is created on first use and reused afterwards;
 * the resulting weights and cost values are stored on this object.
 */
protected function runConjugateGradient(array $samples, array $targets, Closure $gradientFunc): void
{
    if ($this->optimizer === null) {
        $conjugateGradient = new ConjugateGradient($this->featureCount);
        $this->optimizer = $conjugateGradient->setMaxIterations($this->maxIterations);
    }

    $optimizer = $this->optimizer;
    $this->weights = $optimizer->runOptimization($samples, $targets, $gradientFunc);
    $this->costValues = $optimizer->getCostValues();
}
/**
* Returns the callback for the selected cost function.
*
* The callback receives (weights, sample, target), stores the weights on this
* object, and returns [error, gradient, penalty], where penalty is lambda
* when the penalty is 'L2' and 0 otherwise.
*
* @throws \Exception when the stored cost function is neither 'log' nor 'sse'
*/
protected function getCostFunction(): Closure
{
$penalty = 0;
if ($this->penalty === 'L2') {
$penalty = $this->lambda;
}
switch ($this->costFunction) {
case 'log':
/*
* Negative of Log-likelihood cost function to be minimized:
* J(x) = ( - y . log(h(x)) - (1 - y) . log(1 - h(x)))
*
* If regularization term is given, then it will be added to the cost:
* for L2 : J(x) = J(x) + λ/m . w
*
* The gradient of the cost function to be used with gradient descent:
* ∇J(x) = -(y - h(x)) = (h(x) - y)
*/
return function ($weights, $sample, $y) use ($penalty): array {
$this->weights = $weights;
$hX = $this->output($sample);
// In cases where $hX = 1 or $hX = 0, one of the log() terms below
// would be log(0), so these values are nudged into the open interval
if ($hX == 1) {
$hX = 1 - 1e-10;
}
if ($hX == 0) {
$hX = 1e-10;
}
// Targets arrive as -1/1; map them to 0/1 for the cost formula
$y = $y < 0 ? 0 : 1;
$error = -$y * log($hX) - (1 - $y) * log(1 - $hX);
$gradient = $hX - $y;
return [$error, $gradient, $penalty];
};
case 'sse':
/*
* Sum of squared errors or least squared errors cost function:
* J(x) = (y - h(x))^2
*
* If regularization term is given, then it will be added to the cost:
* for L2 : J(x) = J(x) + λ/m . w
*
* The gradient of the cost function as computed below
* (the constant factor 2 of the derivative is omitted):
* ∇J(x) = -(y - h(x)) . h(x) . (1 - h(x))
*/
return function ($weights, $sample, $y) use ($penalty): array {
$this->weights = $weights;
$hX = $this->output($sample);
// Targets arrive as -1/1; map them to 0/1 for the cost formula
$y = $y < 0 ? 0 : 1;
$error = (($y - $hX) ** 2);
$gradient = -($y - $hX) * $hX * (1 - $hX);
return [$error, $gradient, $penalty];
};
default:
// Not reached
throw new Exception(sprintf('Logistic regression has invalid cost function: %s.', $this->costFunction));
}
}
/**
 * Passes the parent's net output through the logistic (sigmoid) function,
 * giving a float value between 0.0 and 1.0.
 */
protected function output(array $sample): float
{
    return 1.0 / (1.0 + exp(-parent::output($sample)));
}
/**
 * Returns the class value for the given input: 1 when the sigmoid output
 * exceeds 0.5, otherwise -1.
 */
protected function outputClass(array $sample): int
{
    return $this->output($sample) > 0.5 ? 1 : -1;
}
/**
 * Returns the probability of the sample belonging to the given label.
 *
 * The (optionally normalized) sample is run through the sigmoid output.
 * That value is returned when $label is stored under a positive key in
 * $this->labels; for any other label its complement (1 - value) is returned.
 *
 * @param mixed $label
 */
protected function predictProbability(array $sample, $label): float
{
    $positiveProbability = $this->output($this->checkNormalizedSample($sample));
    $labelKey = array_search($label, $this->labels, true);

    return $labelKey > 0 ? $positiveProbability : 1 - $positiveProbability;
}
}

View File

@ -0,0 +1,264 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification\Linear;
use Closure;
use Phpml\Classification\Classifier;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Helper\OneVsRest;
use Phpml\Helper\Optimizer\GD;
use Phpml\Helper\Optimizer\Optimizer;
use Phpml\Helper\Optimizer\StochasticGD;
use Phpml\Helper\Predictable;
use Phpml\IncrementalEstimator;
use Phpml\Preprocessing\Normalizer;
class Perceptron implements Classifier, IncrementalEstimator
{
use Predictable;
use OneVsRest;
/**
* @var Optimizer|GD|StochasticGD|null
*/
protected $optimizer;
/**
* @var array
*/
protected $labels = [];
/**
* @var int
*/
protected $featureCount = 0;
/**
* @var array
*/
protected $weights = [];
/**
* @var float
*/
protected $learningRate;
/**
* @var int
*/
protected $maxIterations;
/**
* @var Normalizer
*/
protected $normalizer;
/**
* @var bool
*/
protected $enableEarlyStop = true;
/**
 * Initialize a perceptron classifier with the given learning rate and the
 * maximum number of iterations used while training the perceptron
 *
 * @param float $learningRate    Value between 0.0(exclusive) and 1.0(inclusive)
 * @param int   $maxIterations   Must be at least 1
 * @param bool  $normalizeInputs When true, a Normalizer in NORM_STD mode is attached
 *
 * @throws InvalidArgumentException
 */
public function __construct(float $learningRate = 0.001, int $maxIterations = 1000, bool $normalizeInputs = true)
{
    $rateOutOfRange = $learningRate <= 0.0 || $learningRate > 1.0;
    if ($rateOutOfRange) {
        throw new InvalidArgumentException('Learning rate should be a float value between 0.0(exclusive) and 1.0(inclusive)');
    }

    if ($maxIterations <= 0) {
        throw new InvalidArgumentException('Maximum number of iterations must be an integer greater than 0');
    }

    $this->learningRate = $learningRate;
    $this->maxIterations = $maxIterations;

    if ($normalizeInputs) {
        $this->normalizer = new Normalizer(Normalizer::NORM_STD);
    }
}
/**
* Trains on the given batch by delegating to trainByLabel() with the
* supplied samples, targets and labels.
* NOTE(review): trainByLabel() is not defined in this class - presumably provided by the OneVsRest trait; confirm.
*/
public function partialTrain(array $samples, array $targets, array $labels = []): void
{
$this->trainByLabel($samples, $targets, $labels);
}
/**
 * Trains the model for a two-label problem.
 *
 * Samples are passed through the normalizer when one is configured. The first
 * label is registered under key 1 and the second under key -1, and every
 * target is encoded as 1 or -1 accordingly before the training run starts.
 */
public function trainBinary(array $samples, array $targets, array $labels): void
{
    if ($this->normalizer !== null) {
        $this->normalizer->transform($samples);
    }

    $this->labels = [1 => $labels[0], -1 => $labels[1]];

    // Set all target values to either -1 or 1
    $encodedTargets = [];
    foreach ($targets as $key => $target) {
        $isPositive = (string) $target == (string) $this->labels[1];
        $encodedTargets[$key] = $isPositive ? 1 : -1;
    }

    // Set feature count and start the optimization
    $this->featureCount = count($samples[0]);
    $this->runTraining($samples, $encodedTargets);
}
/**
* Normally enabling early stopping for the optimization procedure may
* help saving processing time, while in some cases it may result in
* premature convergence.<br>
*
* If "false" is given, the optimization procedure will always be executed
* for $maxIterations times.
*
* The flag is handed to the optimizer when it is created in
* runGradientDescent().
*
* @return $this
*/
public function setEarlyStop(bool $enable = true)
{
$this->enableEarlyStop = $enable;
return $this;
}
/**
 * Returns the cost values obtained during the training.
 *
 * @return array Cost values of the last training run, or an empty array when no training has run yet
 */
public function getCostValues(): array
{
    // $costValues is only assigned by training/reset. Before that the property
    // does not exist, and returning it directly would violate the array
    // return type, so fall back to an empty list.
    return $this->costValues ?? [];
}
/**
 * Clears everything learned so far: weights, cost values, labels,
 * feature count and the optimizer instance.
 */
protected function resetBinary(): void
{
    $this->weights = [];
    $this->costValues = [];
    $this->labels = [];
    $this->featureCount = 0;
    $this->optimizer = null;
}
/**
 * Trains the perceptron model with gradient descent optimization
 * to get the correct set of weights
 */
protected function runTraining(array $samples, array $targets): void
{
    // Cost of one sample: squared difference between predicted and target class
    $squaredErrorCost = function ($weights, $sample, $target): array {
        $this->weights = $weights;
        $difference = $this->outputClass($sample) - $target;

        return [$difference ** 2, $difference];
    };

    $this->runGradientDescent($samples, $targets, $squaredErrorCost);
}
/**
 * Executes a Gradient Descent algorithm for the given cost function.
 *
 * The optimizer (GD for batch mode, StochasticGD otherwise) is created on
 * first use and reused afterwards; the resulting weights and cost values
 * are stored on this object.
 */
protected function runGradientDescent(array $samples, array $targets, Closure $gradientFunc, bool $isBatch = false): void
{
    if ($this->optimizer === null) {
        $optimizerClass = $isBatch ? GD::class : StochasticGD::class;

        $this->optimizer = (new $optimizerClass($this->featureCount))
            ->setLearningRate($this->learningRate)
            ->setMaxIterations($this->maxIterations)
            ->setChangeThreshold(1e-6)
            ->setEarlyStop($this->enableEarlyStop);
    }

    $optimizer = $this->optimizer;
    $this->weights = $optimizer->runOptimization($samples, $targets, $gradientFunc);
    $this->costValues = $optimizer->getCostValues();
}
/**
 * Returns the sample as it should be fed to the model: unchanged when no
 * normalizer is configured, otherwise run through the normalizer.
 */
protected function checkNormalizedSample(array $sample): array
{
    if ($this->normalizer === null) {
        return $sample;
    }

    // The normalizer is handed a one-element batch holding the sample
    $batch = [$sample];
    $this->normalizer->transform($batch);

    return $batch[0];
}
/**
 * Calculates the net output of the network for the given input: the weight
 * at index 0 is added as is, every other weight is multiplied with the
 * feature at the preceding index.
 *
 * @return int|float
 */
protected function output(array $sample)
{
    $net = 0;
    foreach ($this->weights as $position => $weight) {
        $net += $position == 0 ? $weight : $weight * $sample[$position - 1];
    }

    return $net;
}
/**
 * Returns the class value for the given input: 1 for a positive net
 * output, -1 otherwise
 */
protected function outputClass(array $sample): int
{
    if ($this->output($sample) > 0) {
        return 1;
    }

    return -1;
}
/**
 * Returns a score for the sample belonging to the given label.
 *
 * When the model predicts $label, the score is the absolute net output for
 * the (optionally normalized) sample; for any other label it is 0.0.
 *
 * @param mixed $label
 */
protected function predictProbability(array $sample, $label): float
{
    if ((string) $this->predictSampleBinary($sample) != (string) $label) {
        return 0.0;
    }

    $normalized = $this->checkNormalizedSample($sample);

    return (float) abs($this->output($normalized));
}
/**
 * Predicts the label of one sample: the (optionally normalized) sample is
 * classified as 1 or -1 and that value is mapped back through $this->labels.
 *
 * @return mixed
 */
protected function predictSampleBinary(array $sample)
{
    $normalized = $this->checkNormalizedSample($sample);

    return $this->labels[$this->outputClass($normalized)];
}
}

View File

@ -0,0 +1,58 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification;
use Phpml\Exception\InvalidArgumentException;
use Phpml\NeuralNetwork\Network\MultilayerPerceptron;
/**
 * Classifier front-end for the multilayer perceptron: maps targets to their
 * position in the class list for training and picks the class with the
 * highest network output for prediction.
 */
class MLPClassifier extends MultilayerPerceptron implements Classifier
{
    /**
     * Returns the key under which the target is stored in $this->classes.
     *
     * @param mixed $target
     *
     * @throws InvalidArgumentException when the target is not one of the accepted classes
     */
    public function getTargetClass($target): int
    {
        // One strict search serves both the membership test and the lookup
        $classIndex = array_search($target, $this->classes, true);
        if ($classIndex === false) {
            throw new InvalidArgumentException(
                sprintf('Target with value "%s" is not part of the accepted classes', $target)
            );
        }

        return $classIndex;
    }

    /**
     * Feeds the sample through the network and returns the key of the
     * largest output value (the first one on ties), or null when the
     * network produces no output at all.
     *
     * @return mixed
     */
    protected function predictSample(array $sample)
    {
        $output = $this->setInput($sample)->getOutput();

        // Start below every finite value so that a class is still chosen when
        // no output is strictly positive; a start value of 0 returned null then.
        $predictedClass = null;
        $max = -INF;
        foreach ($output as $class => $value) {
            if ($value > $max) {
                $predictedClass = $class;
                $max = $value;
            }
        }

        return $predictedClass;
    }

    /**
     * Runs one training step for a single sample.
     *
     * @param mixed $target
     *
     * @throws InvalidArgumentException when the target is not one of the accepted classes
     */
    protected function trainSample(array $sample, $target): void
    {
        // Feed-forward.
        $this->setInput($sample);

        // Back-propagate.
        $this->backpropagation->backpropagate($this->getLayers(), $this->getTargetClass($target));
    }
}

View File

@ -0,0 +1,184 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Helper\Predictable;
use Phpml\Helper\Trainable;
use Phpml\Math\Statistic\Mean;
use Phpml\Math\Statistic\StandardDeviation;
class NaiveBayes implements Classifier
{
use Trainable;
use Predictable;
public const CONTINUOS = 1;
public const NOMINAL = 2;
public const EPSILON = 1e-10;
/**
* @var array
*/
private $std = [];
/**
* @var array
*/
private $mean = [];
/**
* @var array
*/
private $discreteProb = [];
/**
* @var array
*/
private $dataType = [];
/**
* @var array
*/
private $p = [];
/**
* @var int
*/
private $sampleCount = 0;
/**
* @var int
*/
private $featureCount = 0;
/**
* @var array
*/
private $labels = [];
/**
 * Adds the given samples/targets to the ones already stored and recomputes,
 * for every distinct label, its prior (share of all stored samples) and its
 * per-feature statistics.
 */
public function train(array $samples, array $targets): void
{
    $this->samples = array_merge($this->samples, $samples);
    $this->targets = array_merge($this->targets, $targets);

    $this->sampleCount = count($this->samples);
    $this->featureCount = count($this->samples[0]);

    // Flipping twice removes duplicate targets; labels are kept as strings
    $distinctTargets = array_flip(array_flip($this->targets));
    $this->labels = array_map('strval', $distinctTargets);

    foreach ($this->labels as $label) {
        $labelSamples = $this->getSamplesByLabel($label);
        $this->p[$label] = count($labelSamples) / $this->sampleCount;
        $this->calculateStatistics($label, $labelSamples);
    }
}
/**
 * Predicts the most likely label for one sample.
 *
 * Uses the Naive Bayes assumption in log space:
 *   score(label) = log P(label) + sum over features of the per-feature term
 * and returns the label with the highest score.
 *
 * @return mixed
 *
 * @throws InvalidArgumentException when the sample lacks one of the trained features
 */
protected function predictSample(array $sample)
{
    $predictions = [];
    foreach ($this->labels as $label) {
        // The per-feature terms are accumulated by addition, i.e. on a log
        // scale, so the prior has to enter as log P(label) as well; adding the
        // raw prior would mix a probability into a sum of logarithms.
        $score = log($this->p[$label]);
        for ($i = 0; $i < $this->featureCount; ++$i) {
            $score += $this->sampleProbability($sample, $i, $label);
        }

        $predictions[$label] = $score;
    }

    // Highest score first; its key is the predicted label
    arsort($predictions, SORT_NUMERIC);
    reset($predictions);

    return key($predictions);
}
/**
 * Calculates the statistics of every feature for one label and stores them
 * in the private arrays so they are not recomputed at prediction time:
 * mean and standard deviation for numeric columns, relative value
 * frequencies for all other (nominal) columns.
 */
private function calculateStatistics(string $label, array $samples): void
{
    $featureCount = $this->featureCount;

    $this->std[$label] = array_fill(0, $featureCount, 0);
    $this->mean[$label] = array_fill(0, $featureCount, 0);
    $this->dataType[$label] = array_fill(0, $featureCount, self::CONTINUOS);
    $this->discreteProb[$label] = array_fill(0, $featureCount, self::CONTINUOS);

    for ($feature = 0; $feature < $featureCount; ++$feature) {
        // Values of this feature across the label's samples
        $column = array_column($samples, $feature);

        $isNumericColumn = $column === array_filter($column, 'is_numeric');
        if ($isNumericColumn) {
            $this->mean[$label][$feature] = Mean::arithmetic($column);
            // Add epsilon in order to avoid zero stdev
            $this->std[$label][$feature] = 1e-10 + StandardDeviation::population($column, false);

            continue;
        }

        // Any non-numeric value makes the column nominal/categorical/discrete
        $valueCount = count($column);
        $this->dataType[$label][$feature] = self::NOMINAL;
        $this->discreteProb[$label][$feature] = array_map(
            static function ($occurrences) use ($valueCount) {
                return $occurrences / $valueCount;
            },
            array_count_values($column)
        );
    }
}
/**
 * Calculates the log-likelihood of the sample's value for one feature
 * given the label, i.e. log P(feature value | label).
 *
 * Nominal features use the relative frequency observed in training
 * (self::EPSILON for unseen or zero-frequency values); continuous features
 * use the log of the normal/Gaussian density. Both branches return values
 * on the same logarithmic scale so they can be summed.
 *
 * @throws InvalidArgumentException when the sample has no value for the feature
 */
private function sampleProbability(array $sample, int $feature, string $label): float
{
    if (!isset($sample[$feature])) {
        throw new InvalidArgumentException('Missing feature. All samples must have equal number of features');
    }

    $value = $sample[$feature];

    if ($this->dataType[$label][$feature] == self::NOMINAL) {
        $probability = $this->discreteProb[$label][$feature][$value] ?? 0;
        if ($probability == 0) {
            $probability = self::EPSILON;
        }

        // Return the logarithm, matching the log density of the continuous
        // branch; a raw probability would be on a different scale.
        return log($probability);
    }

    $std = $this->std[$label][$feature];
    $mean = $this->mean[$label][$feature];

    // Calculate the probability density by use of normal/Gaussian distribution
    // Ref: https://en.wikipedia.org/wiki/Normal_distribution
    //
    // In order to avoid numerical errors because of small or zero values,
    // the logarithm of the density is computed, as scikit-learn does.
    // (See : https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/naive_bayes.py)
    $variance = $std * $std;
    $logDensity = -0.5 * log(2.0 * M_PI * $variance);
    $logDensity -= 0.5 * (($value - $mean) ** 2) / $variance;

    return $logDensity;
}
/**
 * Returns the stored samples whose target equals the given label.
 *
 * Labels are kept as strings, so each target is cast to string and compared
 * strictly. A loose comparison would treat numeric-looking labels such as
 * "1" and "1.0" as the same label and mix their samples.
 */
private function getSamplesByLabel(string $label): array
{
    $samples = [];
    foreach ($this->targets as $i => $target) {
        if ((string) $target === $label) {
            $samples[] = $this->samples[$i];
        }
    }

    return $samples;
}
}

View File

@ -0,0 +1,26 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification;
use Phpml\SupportVectorMachine\Kernel;
use Phpml\SupportVectorMachine\SupportVectorMachine;
use Phpml\SupportVectorMachine\Type;
/**
* Support vector classifier: a SupportVectorMachine preconfigured with
* Type::C_SVC.
*/
class SVC extends SupportVectorMachine implements Classifier
{
/**
* Forwards all arguments to the parent constructor together with the fixed
* type Type::C_SVC.
* NOTE(review): the literals 0.5 and 0.1 fill the 4th and 8th parent parameters,
* presumably nu and epsilon - confirm against SupportVectorMachine::__construct.
*/
public function __construct(
int $kernel = Kernel::RBF,
float $cost = 1.0,
int $degree = 3,
?float $gamma = null,
float $coef0 = 0.0,
float $tolerance = 0.001,
int $cacheSize = 100,
bool $shrinking = true,
bool $probabilityEstimates = false
) {
parent::__construct(Type::C_SVC, $kernel, $cost, 0.5, $degree, $gamma, $coef0, 0.1, $tolerance, $cacheSize, $shrinking, $probabilityEstimates);
}
}

View File

@ -0,0 +1,21 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification;
/**
* Base class for classifiers that accept one weight per training sample.
*/
abstract class WeightedClassifier implements Classifier
{
/**
* Weight of each sample; empty until setSampleWeights() is called
*
* @var array
*/
protected $weights = [];
/**
* Sets the array including a weight for each sample.
* The array is stored as given; no validation is performed here.
*/
public function setSampleWeights(array $weights): void
{
$this->weights = $weights;
}
}

View File

@ -0,0 +1,10 @@
<?php
declare(strict_types=1);
namespace Phpml\Clustering;
/**
* Contract for clustering algorithms.
*/
interface Clusterer
{
/**
* Groups the given samples into clusters.
*
* @param array $samples Rows of feature values
*
* @return array The clusters found (presumably one entry per cluster holding its members - confirm per implementation)
*/
public function cluster(array $samples): array;
}

120
src/Clustering/DBSCAN.php Normal file
View File

@ -0,0 +1,120 @@
<?php
declare(strict_types=1);
namespace Phpml\Clustering;
use Phpml\Math\Distance;
use Phpml\Math\Distance\Euclidean;
class DBSCAN implements Clusterer
{
private const NOISE = -1;
/**
* @var float
*/
private $epsilon;
/**
* @var int
*/
private $minSamples;
/**
* @var Distance
*/
private $distanceMetric;
/**
 * @param float         $epsilon        Distance below which two samples count as neighbours
 * @param int           $minSamples     Minimum neighbourhood size for a sample to start or extend a cluster
 * @param Distance|null $distanceMetric Metric used for the neighbourhood test; Euclidean when omitted
 */
public function __construct(float $epsilon = 0.5, int $minSamples = 3, ?Distance $distanceMetric = null)
{
    $this->epsilon = $epsilon;
    $this->minSamples = $minSamples;
    $this->distanceMetric = $distanceMetric ?? new Euclidean();
}
/**
 * Clusters the samples: every not yet labelled sample with a large enough
 * neighbourhood starts a new cluster that is then expanded; samples with a
 * too small neighbourhood are marked as noise (and may later be claimed by
 * a cluster during expansion). Noise is not part of the result.
 */
public function cluster(array $samples): array
{
    $labels = [];
    $clusterId = 0;

    foreach ($samples as $index => $sample) {
        if (isset($labels[$index])) {
            continue;
        }

        $neighborIndices = $this->getIndicesInRegion($sample, $samples);
        if (count($neighborIndices) >= $this->minSamples) {
            $labels[$index] = $clusterId;
            $this->expandCluster($samples, $neighborIndices, $labels, $clusterId);
            ++$clusterId;
        } else {
            $labels[$index] = self::NOISE;
        }
    }

    return $this->groupByCluster($samples, $labels, $clusterId);
}
/**
 * Grows cluster $n from the given seed indices. Seeds previously marked as
 * noise are claimed by the cluster without being expanded; unlabelled seeds
 * join the cluster and, when their own neighbourhood is large enough,
 * contribute their neighbours as further seeds.
 *
 * @param array $labels Index => cluster label map, updated in place
 */
private function expandCluster(array $samples, array $seeds, array &$labels, int $n): void
{
    for ($current = array_pop($seeds); $current !== null; $current = array_pop($seeds)) {
        $alreadyLabelled = isset($labels[$current]);
        if ($alreadyLabelled && $labels[$current] !== self::NOISE) {
            // Belongs to a cluster already
            continue;
        }

        $labels[$current] = $n;
        if ($alreadyLabelled) {
            // Former noise point: claimed by the cluster, not expanded
            continue;
        }

        $reachable = $this->getIndicesInRegion($samples[$current], $samples);
        if (count($reachable) >= $this->minSamples) {
            $seeds = array_unique(array_merge($seeds, $reachable));
        }
    }
}
/**
 * Returns the keys of all samples whose distance to $center is strictly
 * smaller than epsilon.
 */
private function getIndicesInRegion(array $center, array $samples): array
{
    $inRegion = [];
    foreach ($samples as $index => $candidate) {
        $gap = $this->distanceMetric->distance($center, $candidate);
        if ($gap < $this->epsilon) {
            $inRegion[] = $index;
        }
    }

    return $inRegion;
}
/**
 * Turns the index => label map into a list of clusters, each holding its
 * samples. Samples labelled as noise are left out.
 *
 * @param int $n Number of clusters found
 */
private function groupByCluster(array $samples, array $labels, int $n): array
{
    $clusters = array_fill(0, $n, []);
    foreach ($samples as $index => $sample) {
        $label = $labels[$index];
        if ($label === self::NOISE) {
            continue;
        }

        $clusters[$label][$index] = $sample;
    }

    // Reindex (i.e. to 0, 1, 2, ...) integer indices for backward compatibility
    return array_map(static function (array $cluster): array {
        return array_merge($cluster, []);
    }, $clusters);
}
}

View File

@ -0,0 +1,236 @@
<?php
declare(strict_types=1);
namespace Phpml\Clustering;
use Phpml\Clustering\KMeans\Cluster;
use Phpml\Clustering\KMeans\Point;
use Phpml\Clustering\KMeans\Space;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Math\Distance\Euclidean;
class FuzzyCMeans implements Clusterer
{
/**
* @var int
*/
private $clustersNumber;
/**
* @var Cluster[]
*/
private $clusters = [];
/**
* @var Space
*/
private $space;
/**
* @var float[][]
*/
private $membership = [];
/**
* @var float
*/
private $fuzziness;
/**
* @var float
*/
private $epsilon;
/**
* @var int
*/
private $maxIterations;
/**
* @var int
*/
private $sampleCount;
/**
* @var array
*/
private $samples = [];
/**
 * @param int   $clustersNumber Number of clusters to form; must be positive
 * @param float $fuzziness      Exponent applied to the membership values
 * @param float $epsilon        Clustering stops once the objective changes by no more than this
 * @param int   $maxIterations  Limit used by the clustering loop
 *
 * @throws InvalidArgumentException when the clusters number is not positive
 */
public function __construct(int $clustersNumber, float $fuzziness = 2.0, float $epsilon = 1e-2, int $maxIterations = 100)
{
    if ($clustersNumber < 1) {
        throw new InvalidArgumentException('Invalid clusters number');
    }

    $this->maxIterations = $maxIterations;
    $this->epsilon = $epsilon;
    $this->fuzziness = $fuzziness;
    $this->clustersNumber = $clustersNumber;
}
/**
* Returns the current membership matrix; it is empty until cluster() has run.
*/
public function getMembershipMatrix(): array
{
return $this->membership;
}
/**
 * Clusters the samples with fuzzy c-means and returns them hard-assigned:
 * every sample is attached to the cluster in which its membership is highest.
 *
 * @param array $samples Rows of feature values; every row is expected to have the same length
 *
 * @return array One entry per cluster holding the points attached to it
 */
public function cluster(array $samples): array
{
    // Initialize variables, clusters and membership matrix
    $this->sampleCount = count($samples);
    $this->samples = $samples;
    $this->space = new Space(count($samples[0]));

    // Start from fresh clusters so that a repeated call does not reuse
    // cluster objects (and presumably their attached points) of an earlier run
    $this->clusters = [];
    $this->initClusters();

    // Our goal is minimizing the objective value while
    // executing the clustering steps at a maximum number of iterations
    $lastObjective = 0.0;
    $iterations = 0;
    do {
        // Update the membership matrix and cluster centers, respectively
        $this->updateMembershipMatrix();
        $this->updateClusters();

        // Calculate the new value of the objective function
        $objectiveVal = $this->getObjective();
        $difference = abs($lastObjective - $objectiveVal);
        $lastObjective = $objectiveVal;

        // Pre-increment and strict "<" cap the loop at exactly maxIterations
        // passes; the former "$iterations++ <=" allowed two passes more.
    } while ($difference > $this->epsilon && ++$iterations < $this->maxIterations);

    // Attach (hard cluster) each data point to the nearest cluster
    for ($k = 0; $k < $this->sampleCount; ++$k) {
        $column = array_column($this->membership, $k);
        arsort($column);
        reset($column);
        $cluster = $this->clusters[key($column)];
        $cluster->attach(new Point($this->samples[$k]));
    }

    // Return grouped samples
    $grouped = [];
    foreach ($this->clusters as $cluster) {
        $grouped[] = $cluster->getPoints();
    }

    return $grouped;
}
/**
 * Creates a random membership matrix and derives the initial cluster
 * centers from it.
 */
protected function initClusters(): void
{
    // Membership array is a matrix of cluster number by sample counts.
    // It must have one row per CLUSTER: updateClusters() and
    // updateMembershipMatrix() address rows 0..clustersNumber-1. Sizing it by
    // the feature dimension only worked when both numbers happened to match.
    // We initialize the membership array with random values
    $this->generateRandomMembership($this->clustersNumber, $this->sampleCount);
    $this->updateClusters();
}
/**
 * Fills the membership matrix with $rows x $cols random values; each row
 * is scaled so that its values sum to 1.
 */
protected function generateRandomMembership(int $rows, int $cols): void
{
    $this->membership = [];

    for ($rowIndex = 0; $rowIndex < $rows; ++$rowIndex) {
        $rawRow = [];
        $rowTotal = 0.0;
        for ($colIndex = 0; $colIndex < $cols; ++$colIndex) {
            // Random value out of 0.1, 0.2, ..., 0.5
            $rawRow[] = $drawn = random_int(1, 5) / 10.0;
            $rowTotal += $drawn;
        }

        $normalize = static function ($val) use ($rowTotal): float {
            return $val / $rowTotal;
        };
        $this->membership[] = array_map($normalize, $rawRow);
    }
}
/**
 * Recomputes every cluster center from the membership matrix. The cluster
 * objects are created first when none exist yet. Each coordinate becomes
 * the membership-weighted sum of that feature divided by the sum of the weights.
 */
protected function updateClusters(): void
{
    $dimension = $this->space->getDimension();

    if (count($this->clusters) === 0) {
        $origin = array_fill(0, $dimension, 0.0);
        for ($created = 0; $created < $this->clustersNumber; ++$created) {
            $this->clusters[] = new Cluster($this->space, $origin);
        }
    }

    for ($index = 0; $index < $this->clustersNumber; ++$index) {
        $cluster = $this->clusters[$index];
        $coordinates = $cluster->getCoordinates();

        for ($axis = 0; $axis < $dimension; ++$axis) {
            $weightedSum = $this->getMembershipRowTotal($index, $axis, true);
            $weightSum = $this->getMembershipRowTotal($index, $axis, false);
            $coordinates[$axis] = $weightedSum / $weightSum;
        }

        $cluster->setCoordinates($coordinates);
    }
}
/**
 * Sums, over all samples, the membership of the given row raised to the
 * fuzziness exponent. With $multiply set, each term is additionally
 * multiplied by the sample's value in column $col.
 */
protected function getMembershipRowTotal(int $row, int $col, bool $multiply): float
{
    $total = 0.0;
    for ($sampleIndex = 0; $sampleIndex < $this->sampleCount; ++$sampleIndex) {
        $weight = $this->membership[$row][$sampleIndex] ** $this->fuzziness;
        $total += $multiply ? $weight * $this->samples[$sampleIndex][$col] : $weight;
    }

    return $total;
}
/**
 * Recomputes the membership of every sample in every cluster as the
 * reciprocal of the value returned by getDistanceCalc().
 */
protected function updateMembershipMatrix(): void
{
    for ($clusterIndex = 0; $clusterIndex < $this->clustersNumber; ++$clusterIndex) {
        for ($sampleIndex = 0; $sampleIndex < $this->sampleCount; ++$sampleIndex) {
            $this->membership[$clusterIndex][$sampleIndex] = 1.0 / $this->getDistanceCalc($clusterIndex, $sampleIndex);
        }
    }
}
/**
 * Computes the denominator of the fuzzy c-means membership update for
 * cluster $row and sample $col:
 *
 *   sum over all clusters j of (d(row, col) / d(j, col)) ^ (2 / (m - 1))
 *
 * where d is the Euclidean distance between a cluster center and the sample
 * and m is the fuzziness. The caller takes the reciprocal of the result.
 *
 * @return float The sum above; 1.0 when the sample lies exactly on the center
 *               of cluster $row, INF when it lies exactly on another center
 */
protected function getDistanceCalc(int $row, int $col): float
{
    $metric = new Euclidean();
    $sample = $this->samples[$col];

    $ownDistance = $metric->distance($this->clusters[$row]->getCoordinates(), $sample);
    if ($ownDistance == 0.0) {
        // Sample coincides with this center: full membership (1 / 1.0)
        return 1.0;
    }

    // The exponent is 2 / (m - 1). The former expression squared the ratio
    // and then DIVIDED by (m - 1), which is only equal for m = 2.
    $exponent = 2.0 / ($this->fuzziness - 1);

    $sum = 0.0;
    for ($j = 0; $j < $this->clustersNumber; ++$j) {
        $otherDistance = $metric->distance($this->clusters[$j]->getCoordinates(), $sample);
        if ($otherDistance == 0.0) {
            // Sample coincides with another center: membership here is 1 / INF = 0
            return INF;
        }

        $sum += ($ownDistance / $otherDistance) ** $exponent;
    }

    return $sum;
}
/**
 * Value of the objective that the clustering loop tries to minimize:
 * the sum of the Euclidean distances between every cluster center and
 * every sample.
 */
protected function getObjective(): float
{
    $metric = new Euclidean();
    $total = 0.0;

    for ($clusterIndex = 0; $clusterIndex < $this->clustersNumber; ++$clusterIndex) {
        $center = $this->clusters[$clusterIndex]->getCoordinates();
        for ($sampleIndex = 0; $sampleIndex < $this->sampleCount; ++$sampleIndex) {
            $total += $metric->distance($center, $this->samples[$sampleIndex]);
        }
    }

    return $total;
}
}

View File

@ -9,8 +9,9 @@ use Phpml\Exception\InvalidArgumentException;
class KMeans implements Clusterer class KMeans implements Clusterer
{ {
const INIT_RANDOM = 1; public const INIT_RANDOM = 1;
const INIT_KMEANS_PLUS_PLUS = 2;
public const INIT_KMEANS_PLUS_PLUS = 2;
/** /**
* @var int * @var int
@ -22,32 +23,21 @@ class KMeans implements Clusterer
*/ */
private $initialization; private $initialization;
/**
* @param int $clustersNumber
* @param int $initialization
*
* @throws InvalidArgumentException
*/
public function __construct(int $clustersNumber, int $initialization = self::INIT_KMEANS_PLUS_PLUS) public function __construct(int $clustersNumber, int $initialization = self::INIT_KMEANS_PLUS_PLUS)
{ {
if ($clustersNumber <= 0) { if ($clustersNumber <= 0) {
throw InvalidArgumentException::invalidClustersNumber(); throw new InvalidArgumentException('Invalid clusters number');
} }
$this->clustersNumber = $clustersNumber; $this->clustersNumber = $clustersNumber;
$this->initialization = $initialization; $this->initialization = $initialization;
} }
/** public function cluster(array $samples): array
* @param array $samples
*
* @return array
*/
public function cluster(array $samples)
{ {
$space = new Space(count($samples[0])); $space = new Space(count(reset($samples)));
foreach ($samples as $sample) { foreach ($samples as $key => $sample) {
$space->addPoint($sample); $space->addPoint($sample, $key);
} }
$clusters = []; $clusters = [];

View File

@ -5,11 +5,10 @@ declare (strict_types = 1);
namespace Phpml\Clustering\KMeans; namespace Phpml\Clustering\KMeans;
use IteratorAggregate; use IteratorAggregate;
use Countable;
use SplObjectStorage;
use LogicException; use LogicException;
use SplObjectStorage;
class Cluster extends Point implements IteratorAggregate, Countable class Cluster extends Point implements IteratorAggregate
{ {
/** /**
* @var Space * @var Space
@ -21,10 +20,6 @@ class Cluster extends Point implements IteratorAggregate, Countable
*/ */
protected $points; protected $points;
/**
* @param Space $space
* @param array $coordinates
*/
public function __construct(Space $space, array $coordinates) public function __construct(Space $space, array $coordinates)
{ {
parent::__construct($coordinates); parent::__construct($coordinates);
@ -32,39 +27,32 @@ class Cluster extends Point implements IteratorAggregate, Countable
$this->points = new SplObjectStorage(); $this->points = new SplObjectStorage();
} }
/** public function getPoints(): array
* @return array
*/
public function getPoints()
{ {
$points = []; $points = [];
foreach ($this->points as $point) { foreach ($this->points as $point) {
if ($point->label === null) {
$points[] = $point->toArray(); $points[] = $point->toArray();
} else {
$points[$point->label] = $point->toArray();
}
} }
return $points; return $points;
} }
/** public function toArray(): array
* @return array
*/
public function toArray()
{ {
return array( return [
'centroid' => parent::toArray(), 'centroid' => parent::toArray(),
'points' => $this->getPoints(), 'points' => $this->getPoints(),
); ];
} }
/** public function attach(Point $point): Point
* @param Point $point
*
* @return Point
*/
public function attach(Point $point)
{ {
if ($point instanceof self) { if ($point instanceof self) {
throw new LogicException('cannot attach a cluster to another'); throw new LogicException('Cannot attach a cluster to another');
} }
$this->points->attach($point); $this->points->attach($point);
@ -72,37 +60,27 @@ class Cluster extends Point implements IteratorAggregate, Countable
return $point; return $point;
} }
/** public function detach(Point $point): Point
* @param Point $point
*
* @return Point
*/
public function detach(Point $point)
{ {
$this->points->detach($point); $this->points->detach($point);
return $point; return $point;
} }
/** public function attachAll(SplObjectStorage $points): void
* @param SplObjectStorage $points
*/
public function attachAll(SplObjectStorage $points)
{ {
$this->points->addAll($points); $this->points->addAll($points);
} }
/** public function detachAll(SplObjectStorage $points): void
* @param SplObjectStorage $points
*/
public function detachAll(SplObjectStorage $points)
{ {
$this->points->removeAll($points); $this->points->removeAll($points);
} }
public function updateCentroid() public function updateCentroid(): void
{ {
if (!$count = count($this->points)) { $count = count($this->points);
if ($count === 0) {
return; return;
} }
@ -127,11 +105,13 @@ class Cluster extends Point implements IteratorAggregate, Countable
return $this->points; return $this->points;
} }
/** public function count(): int
* @return mixed
*/
public function count()
{ {
return count($this->points); return count($this->points);
} }
public function setCoordinates(array $newCoordinates): void
{
$this->coordinates = $newCoordinates;
}
} }

View File

@ -6,7 +6,7 @@ namespace Phpml\Clustering\KMeans;
use ArrayAccess; use ArrayAccess;
class Point implements ArrayAccess class Point implements ArrayAccess, \Countable
{ {
/** /**
* @var int * @var int
@ -16,32 +16,32 @@ class Point implements ArrayAccess
/** /**
* @var array * @var array
*/ */
protected $coordinates; protected $coordinates = [];
/** /**
* @param array $coordinates * @var mixed
*/ */
public function __construct(array $coordinates) protected $label;
/**
* @param mixed $label
*/
public function __construct(array $coordinates, $label = null)
{ {
$this->dimension = count($coordinates); $this->dimension = count($coordinates);
$this->coordinates = $coordinates; $this->coordinates = $coordinates;
$this->label = $label;
} }
/** public function toArray(): array
* @return array
*/
public function toArray()
{ {
return $this->coordinates; return $this->coordinates;
} }
/** /**
* @param Point $point * @return float|int
* @param bool $precise
*
* @return int|mixed
*/ */
public function getDistanceWith(self $point, $precise = true) public function getDistanceWith(self $point, bool $precise = true)
{ {
$distance = 0; $distance = 0;
for ($n = 0; $n < $this->dimension; ++$n) { for ($n = 0; $n < $this->dimension; ++$n) {
@ -49,22 +49,23 @@ class Point implements ArrayAccess
$distance += $difference * $difference; $distance += $difference * $difference;
} }
return $precise ? sqrt($distance) : $distance; return $precise ? $distance ** .5 : $distance;
} }
/** /**
* @param $points * @param Point[] $points
*
* @return mixed
*/ */
public function getClosest($points) public function getClosest(array $points): ?self
{ {
$minPoint = null;
foreach ($points as $point) { foreach ($points as $point) {
$distance = $this->getDistanceWith($point, false); $distance = $this->getDistanceWith($point, false);
if (!isset($minDistance)) { if (!isset($minDistance)) {
$minDistance = $distance; $minDistance = $distance;
$minPoint = $point; $minPoint = $point;
continue; continue;
} }
@ -77,20 +78,15 @@ class Point implements ArrayAccess
return $minPoint; return $minPoint;
} }
/** public function getCoordinates(): array
* @return array
*/
public function getCoordinates()
{ {
return $this->coordinates; return $this->coordinates;
} }
/** /**
* @param mixed $offset * @param mixed $offset
*
* @return bool
*/ */
public function offsetExists($offset) public function offsetExists($offset): bool
{ {
return isset($this->coordinates[$offset]); return isset($this->coordinates[$offset]);
} }
@ -109,7 +105,7 @@ class Point implements ArrayAccess
* @param mixed $offset * @param mixed $offset
* @param mixed $value * @param mixed $value
*/ */
public function offsetSet($offset, $value) public function offsetSet($offset, $value): void
{ {
$this->coordinates[$offset] = $value; $this->coordinates[$offset] = $value;
} }
@ -117,8 +113,13 @@ class Point implements ArrayAccess
/** /**
* @param mixed $offset * @param mixed $offset
*/ */
public function offsetUnset($offset) public function offsetUnset($offset): void
{ {
unset($this->coordinates[$offset]); unset($this->coordinates[$offset]);
} }
public function count(): int
{
return count($this->coordinates);
}
} }

View File

@ -0,0 +1,263 @@
<?php
declare(strict_types=1);
namespace Phpml\Clustering\KMeans;
use InvalidArgumentException;
use LogicException;
use Phpml\Clustering\KMeans;
use SplObjectStorage;
class Space extends SplObjectStorage
{
    /**
     * Number of coordinates every point of this space must have.
     *
     * @var int
     */
    protected $dimension;

    /**
     * @throws LogicException when $dimension is lower than 1
     */
    public function __construct(int $dimension)
    {
        if ($dimension < 1) {
            throw new LogicException('a space dimension cannot be null or negative');
        }

        $this->dimension = $dimension;
    }

    /**
     * Exports the coordinates of every stored point under the 'points' key.
     */
    public function toArray(): array
    {
        $points = [];

        /** @var Point $point */
        foreach ($this as $point) {
            $points[] = $point->toArray();
        }

        return ['points' => $points];
    }

    /**
     * Creates a point without storing it in the space.
     *
     * @param mixed $label
     *
     * @throws LogicException when the number of coordinates differs from
     *                        the dimension of the space
     */
    public function newPoint(array $coordinates, $label = null): Point
    {
        if (count($coordinates) !== $this->dimension) {
            throw new LogicException('('.implode(',', $coordinates).') is not a point of this space');
        }

        return new Point($coordinates, $label);
    }

    /**
     * Creates a point and attaches it to the space.
     *
     * @param mixed $label
     * @param mixed $data
     */
    public function addPoint(array $coordinates, $label = null, $data = null): void
    {
        $this->attach($this->newPoint($coordinates, $label), $data);
    }

    /**
     * Stores a point; anything that is not a Point is rejected.
     *
     * @param object $point
     * @param mixed  $data
     *
     * @throws InvalidArgumentException when $point is not a Point
     */
    public function attach($point, $data = null): void
    {
        if (!$point instanceof Point) {
            throw new InvalidArgumentException('can only attach points to spaces');
        }

        parent::attach($point, $data);
    }

    public function getDimension(): int
    {
        return $this->dimension;
    }

    /**
     * Per-axis minimum and maximum of the stored points.
     *
     * @return array|bool [Point $min, Point $max], or false when the space
     *                    holds no point
     */
    public function getBoundaries()
    {
        if (count($this) === 0) {
            return false;
        }

        $min = $this->newPoint(array_fill(0, $this->dimension, null));
        $max = $this->newPoint(array_fill(0, $this->dimension, null));

        /** @var Point $point */
        foreach ($this as $point) {
            for ($n = 0; $n < $this->dimension; ++$n) {
                if ($min[$n] === null || $min[$n] > $point[$n]) {
                    $min[$n] = $point[$n];
                }

                if ($max[$n] === null || $max[$n] < $point[$n]) {
                    $max[$n] = $point[$n];
                }
            }
        }

        return [$min, $max];
    }

    /**
     * Builds a point whose coordinates are drawn at random between the
     * matching coordinates of $min and $max.
     *
     * Integer bounds keep producing integers through random_int(). Any
     * other bounds (floats in particular) are interpolated, because
     * random_int() only accepts integers and this file runs under strict
     * types.
     */
    public function getRandomPoint(Point $min, Point $max): Point
    {
        $point = $this->newPoint(array_fill(0, $this->dimension, null));

        for ($n = 0; $n < $this->dimension; ++$n) {
            $lower = $min[$n];
            $upper = $max[$n];

            if (is_int($lower) && is_int($upper)) {
                $point[$n] = random_int($lower, $upper);

                continue;
            }

            $point[$n] = $lower + $this->randomFraction() * ($upper - $lower);
        }

        return $point;
    }

    /**
     * Initialises the clusters and calls iterate() until it reports
     * convergence.
     *
     * @return Cluster[]
     */
    public function cluster(int $clustersNumber, int $initMethod = KMeans::INIT_RANDOM): array
    {
        $clusters = $this->initializeClusters($clustersNumber, $initMethod);

        do {
        } while (!$this->iterate($clusters));

        return $clusters;
    }

    /**
     * Creates the initial clusters with the requested method and attaches
     * every point of the space to the first one.
     *
     * @return Cluster[] Empty when $initMethod is not a known KMeans::INIT_* value
     */
    protected function initializeClusters(int $clustersNumber, int $initMethod): array
    {
        switch ($initMethod) {
            case KMeans::INIT_RANDOM:
                $clusters = $this->initializeRandomClusters($clustersNumber);

                break;

            case KMeans::INIT_KMEANS_PLUS_PLUS:
                $clusters = $this->initializeKMPPClusters($clustersNumber);

                break;

            default:
                return [];
        }

        $clusters[0]->attachAll($this);

        return $clusters;
    }

    /**
     * Runs one reassignment pass: every point is moved to its closest
     * cluster, then every centroid is updated.
     *
     * @param Cluster[] $clusters
     *
     * @return bool true when no point changed cluster during the pass
     */
    protected function iterate(array $clusters): bool
    {
        $convergence = true;

        // Moves are collected first and applied once the scan is over, so
        // the clusters are not modified while they are being iterated.
        $attach = new SplObjectStorage();
        $detach = new SplObjectStorage();

        foreach ($clusters as $cluster) {
            foreach ($cluster as $point) {
                $closest = $point->getClosest($clusters);

                if ($closest === null) {
                    continue;
                }

                if ($closest !== $cluster) {
                    $attach[$closest] ?? $attach[$closest] = new SplObjectStorage();
                    $detach[$cluster] ?? $detach[$cluster] = new SplObjectStorage();

                    $attach[$closest]->attach($point);
                    $detach[$cluster]->attach($point);

                    $convergence = false;
                }
            }
        }

        /** @var Cluster $cluster */
        foreach ($attach as $cluster) {
            $cluster->attachAll($attach[$cluster]);
        }

        /** @var Cluster $cluster */
        foreach ($detach as $cluster) {
            $cluster->detachAll($detach[$cluster]);
        }

        foreach ($clusters as $cluster) {
            $cluster->updateCentroid();
        }

        return $convergence;
    }

    /**
     * k-means++ style seeding: the first centre is the first point of the
     * space; each further centre is a point drawn with a probability
     * proportional to its distance to the closest centre chosen so far.
     *
     * The draw uses a floating-point threshold. Truncating the distance
     * total to an integer would turn any total below 1 into 0 and make
     * the first point win every draw.
     *
     * @return Cluster[]
     */
    protected function initializeKMPPClusters(int $clustersNumber): array
    {
        $clusters = [];
        $this->rewind();

        /** @var Point $current */
        $current = $this->current();

        $clusters[] = new Cluster($this, $current->getCoordinates());

        $distances = new SplObjectStorage();

        for ($i = 1; $i < $clustersNumber; ++$i) {
            $sum = 0;

            /** @var Point $point */
            foreach ($this as $point) {
                $closest = $point->getClosest($clusters);
                if ($closest === null) {
                    continue;
                }

                $distance = $point->getDistanceWith($closest);
                $sum += $distances[$point] = $distance;
            }

            $threshold = $this->randomFraction() * $sum;

            // Walk the points until the cumulated distance reaches the
            // threshold. Remembering the last visited point guarantees a
            // pick even if rounding leaves a tiny positive remainder.
            $chosen = null;

            /** @var Point $point */
            foreach ($this as $point) {
                $chosen = $point;
                $threshold -= $distances[$point];

                if ($threshold <= 0) {
                    break;
                }
            }

            if ($chosen !== null) {
                $clusters[] = new Cluster($this, $chosen->getCoordinates());
            }
        }

        return $clusters;
    }

    /**
     * Creates clusters at random positions inside the boundaries of the
     * space.
     *
     * @return Cluster[]
     *
     * @throws LogicException when the space holds no point, because it has
     *                        no boundaries to draw from
     */
    private function initializeRandomClusters(int $clustersNumber): array
    {
        $boundaries = $this->getBoundaries();
        if (!is_array($boundaries)) {
            throw new LogicException('cannot initialize random clusters in an empty space');
        }

        [$min, $max] = $boundaries;

        $clusters = [];
        for ($n = 0; $n < $clustersNumber; ++$n) {
            $clusters[] = new Cluster($this, $this->getRandomPoint($min, $max)->getCoordinates());
        }

        return $clusters;
    }

    /**
     * Random number between 0 and 1 (both included), drawn from the same
     * generator as the integer draws of this class.
     */
    private function randomFraction(): float
    {
        return (float) (random_int(0, PHP_INT_MAX) / PHP_INT_MAX);
    }
}

View File

@ -0,0 +1,26 @@
<?php
declare(strict_types=1);
namespace Phpml\CrossValidation;
use Phpml\Dataset\Dataset;
class RandomSplit extends Split
{
    /**
     * Distributes every sample of the dataset, in random order, between
     * the test and the train sets.
     *
     * Samples are drawn WITHOUT replacement: each one ends up in exactly
     * one of the two sets, so the test set never contains a sample the
     * model was trained on. Samples go to the test set until the share
     * added by this call reaches $testSize; the rest go to the train set.
     * Results are appended to whatever the sets already hold.
     *
     * @param float $testSize Share of the dataset to put in the test set
     */
    protected function splitDataset(Dataset $dataset, float $testSize): void
    {
        // Re-index so that positions 0..n-1 are guaranteed to exist.
        $samples = array_values($dataset->getSamples());
        $labels = array_values($dataset->getTargets());
        $datasetSize = count($samples);
        $testCount = count($this->testSamples);

        for ($remaining = $datasetSize; $remaining > 0; --$remaining) {
            $key = mt_rand(0, $remaining - 1);
            $setName = (count($this->testSamples) - $testCount) / $datasetSize >= $testSize ? 'train' : 'test';

            $this->{$setName.'Samples'}[] = $samples[$key];
            $this->{$setName.'Labels'}[] = $labels[$key];

            // Remove the drawn entry in O(1): overwrite it with the last
            // remaining entry, then drop that last slot.
            $last = $remaining - 1;
            $samples[$key] = $samples[$last];
            $labels[$key] = $labels[$last];
            unset($samples[$last], $labels[$last]);
        }
    }
}

View File

@ -0,0 +1,73 @@
<?php
declare(strict_types=1);
namespace Phpml\CrossValidation;
use Phpml\Dataset\Dataset;
use Phpml\Exception\InvalidArgumentException;
abstract class Split
{
    /**
     * Samples assigned to the training set.
     *
     * @var array
     */
    protected $trainSamples = [];

    /**
     * Samples assigned to the test set.
     *
     * @var array
     */
    protected $testSamples = [];

    /**
     * Labels of the training samples.
     *
     * @var array
     */
    protected $trainLabels = [];

    /**
     * Labels of the test samples.
     *
     * @var array
     */
    protected $testLabels = [];

    /**
     * Seeds the random generator, then splits the dataset.
     *
     * @param float    $testSize Share of the dataset for the test set
     * @param int|null $seed     Seed handed to seedGenerator()
     *
     * @throws InvalidArgumentException when $testSize is 0 or less, or 1 or more
     */
    public function __construct(Dataset $dataset, float $testSize = 0.3, ?int $seed = null)
    {
        if (0 >= $testSize || 1 <= $testSize) {
            throw new InvalidArgumentException('testsize must be between 0.0 and 1.0');
        }

        $this->seedGenerator($seed);
        $this->splitDataset($dataset, $testSize);
    }

    public function getTrainSamples(): array
    {
        return $this->trainSamples;
    }

    public function getTestSamples(): array
    {
        return $this->testSamples;
    }

    public function getTrainLabels(): array
    {
        return $this->trainLabels;
    }

    public function getTestLabels(): array
    {
        return $this->testLabels;
    }

    /**
     * Fills the four train/test arrays from the dataset.
     */
    abstract protected function splitDataset(Dataset $dataset, float $testSize): void;

    /**
     * Seeds mt_rand(): with the given seed when there is one, otherwise
     * with the generator's own default seeding.
     */
    protected function seedGenerator(?int $seed = null): void
    {
        if ($seed !== null) {
            mt_srand($seed);

            return;
        }

        mt_srand();
    }
}

View File

@ -0,0 +1,49 @@
<?php
declare(strict_types=1);
namespace Phpml\CrossValidation;
use Phpml\Dataset\ArrayDataset;
use Phpml\Dataset\Dataset;
class StratifiedRandomSplit extends RandomSplit
{
    /**
     * Splits the dataset target by target: the samples of each distinct
     * target are handed to the parent random split separately.
     */
    protected function splitDataset(Dataset $dataset, float $testSize): void
    {
        foreach ($this->splitByTarget($dataset) as $targetSet) {
            parent::splitDataset($targetSet, $testSize);
        }
    }

    /**
     * Groups the samples of the dataset by target value.
     *
     * @return Dataset[] One dataset per distinct target, keyed by target
     */
    private function splitByTarget(Dataset $dataset): array
    {
        $targets = $dataset->getTargets();
        $samples = $dataset->getSamples();

        $uniqueTargets = array_unique($targets);

        // One empty bucket per distinct target.
        /** @var array $split */
        $split = array_fill_keys($uniqueTargets, []);

        foreach ($samples as $key => $sample) {
            $split[$targets[$key]][] = $sample;
        }

        return $this->createDatasets($uniqueTargets, $split);
    }

    /**
     * Wraps each bucket of samples in an ArrayDataset whose targets all
     * equal the bucket's target.
     */
    private function createDatasets(array $uniqueTargets, array $split): array
    {
        $datasets = [];

        foreach ($uniqueTargets as $target) {
            $bucket = $split[$target];
            $labels = array_fill(0, count($bucket), $target);
            $datasets[$target] = new ArrayDataset($bucket, $labels);
        }

        return $datasets;
    }
}

View File

@ -0,0 +1,62 @@
<?php
declare(strict_types=1);
namespace Phpml\Dataset;
use Phpml\Exception\InvalidArgumentException;
class ArrayDataset implements Dataset
{
    /**
     * @var array
     */
    protected $samples = [];

    /**
     * @var array
     */
    protected $targets = [];

    /**
     * @throws InvalidArgumentException when the two arrays differ in size
     */
    public function __construct(array $samples, array $targets)
    {
        $sizesMatch = count($samples) === count($targets);
        if (!$sizesMatch) {
            throw new InvalidArgumentException('Size of given arrays does not match');
        }

        $this->samples = $samples;
        $this->targets = $targets;
    }

    public function getSamples(): array
    {
        return $this->samples;
    }

    public function getTargets(): array
    {
        return $this->targets;
    }

    /**
     * Removes the given columns from every sample and re-indexes the
     * remaining values from 0.
     *
     * @param int[] $columns
     */
    public function removeColumns(array $columns): void
    {
        foreach (array_keys($this->samples) as $key) {
            $this->removeColumnsFromSample($this->samples[$key], $columns);
        }
    }

    /**
     * Removes the given columns from one sample, in place.
     */
    private function removeColumnsFromSample(array &$sample, array $columns): void
    {
        foreach ($columns as $column) {
            unset($sample[$column]);
        }

        // Close the gaps left by the removed columns.
        $sample = array_values($sample);
    }
}

View File

@ -0,0 +1,52 @@
<?php
declare(strict_types=1);
namespace Phpml\Dataset;
use Phpml\Exception\FileException;
class CsvDataset extends ArrayDataset
{
    /**
     * Names of the feature columns: taken from the heading row when there
     * is one, otherwise the numbers 0 .. $features - 1.
     *
     * @var array
     */
    protected $columnNames = [];

    /**
     * Loads a CSV file: the first $features columns of each row form the
     * sample, the column right after them is the target.
     *
     * Blank lines are skipped. fgetcsv() reports them as an array holding
     * a single null, which would otherwise be stored as a sample without
     * a target.
     *
     * @param string $filepath      Path of the CSV file
     * @param int    $features      Number of feature columns per row
     * @param bool   $headingRow    Whether the first row holds column names
     * @param string $delimiter     Field delimiter
     * @param int    $maxLineLength Line length passed to fgetcsv()
     *
     * @throws FileException when the file is missing or cannot be opened
     */
    public function __construct(string $filepath, int $features, bool $headingRow = true, string $delimiter = ',', int $maxLineLength = 0)
    {
        if (!file_exists($filepath)) {
            throw new FileException(sprintf('File "%s" missing.', basename($filepath)));
        }

        $handle = fopen($filepath, 'rb');
        if ($handle === false) {
            throw new FileException(sprintf('File "%s" can\'t be open.', basename($filepath)));
        }

        $samples = $targets = [];

        // The handle is released even if reading the rows fails.
        try {
            if ($headingRow) {
                $data = fgetcsv($handle, $maxLineLength, $delimiter);
                $this->columnNames = array_slice((array) $data, 0, $features);
            } else {
                $this->columnNames = range(0, $features - 1);
            }

            while (is_array($data = fgetcsv($handle, $maxLineLength, $delimiter))) {
                if ($data === [null]) {
                    // Blank line.
                    continue;
                }

                $samples[] = array_slice($data, 0, $features);
                $targets[] = $data[$features];
            }
        } finally {
            fclose($handle);
        }

        parent::__construct($samples, $targets);
    }

    public function getColumnNames(): array
    {
        return $this->columnNames;
    }
}

12
src/Dataset/Dataset.php Normal file
View File

@ -0,0 +1,12 @@
<?php
declare(strict_types=1);
namespace Phpml\Dataset;
/**
 * A set of samples together with their targets.
 */
interface Dataset
{
    /**
     * Returns the samples of the dataset.
     */
    public function getSamples(): array;
    /**
     * Returns the targets of the dataset.
     */
    public function getTargets(): array;
}

View File

@ -18,11 +18,11 @@ use Phpml\Dataset\CsvDataset;
* Samples total: 214 * Samples total: 214
* Features per sample: 9. * Features per sample: 9.
*/ */
class Glass extends CsvDataset class GlassDataset extends CsvDataset
{ {
public function __construct() public function __construct()
{ {
$filepath = dirname(__FILE__).'/../../../../data/glass.csv'; $filepath = __DIR__.'/../../../data/glass.csv';
parent::__construct($filepath, 9, true); parent::__construct($filepath, 9, true);
} }
} }

View File

@ -12,11 +12,11 @@ use Phpml\Dataset\CsvDataset;
* Samples total: 150 * Samples total: 150
* Features per sample: 4. * Features per sample: 4.
*/ */
class Iris extends CsvDataset class IrisDataset extends CsvDataset
{ {
public function __construct() public function __construct()
{ {
$filepath = dirname(__FILE__).'/../../../../data/iris.csv'; $filepath = __DIR__.'/../../../data/iris.csv';
parent::__construct($filepath, 4, true); parent::__construct($filepath, 4, true);
} }
} }

View File

@ -12,11 +12,11 @@ use Phpml\Dataset\CsvDataset;
* Samples total: 178 * Samples total: 178
* Features per sample: 13. * Features per sample: 13.
*/ */
class Wine extends CsvDataset class WineDataset extends CsvDataset
{ {
public function __construct() public function __construct()
{ {
$filepath = dirname(__FILE__).'/../../../../data/wine.csv'; $filepath = __DIR__.'/../../../data/wine.csv';
parent::__construct($filepath, 13, true); parent::__construct($filepath, 13, true);
} }
} }

View File

@ -0,0 +1,47 @@
<?php
declare(strict_types=1);
namespace Phpml\Dataset;
use Phpml\Exception\DatasetException;
class FilesDataset extends ArrayDataset
{
    /**
     * Builds the dataset from a directory tree: every sub-directory of
     * $rootPath is a target, and the content of each file directly inside
     * it is a sample.
     *
     * @throws DatasetException when $rootPath is not a directory or cannot be scanned
     */
    public function __construct(string $rootPath)
    {
        if (!is_dir($rootPath)) {
            throw new DatasetException(sprintf('Dataset root folder "%s" missing.', $rootPath));
        }

        $this->scanRootPath($rootPath);
    }

    /**
     * Scans every sub-directory of the root folder.
     */
    private function scanRootPath(string $rootPath): void
    {
        $pattern = $rootPath.DIRECTORY_SEPARATOR.'*';
        $dirs = glob($pattern, GLOB_ONLYDIR);

        if ($dirs === false) {
            throw new DatasetException(sprintf('An error occurred during directory "%s" scan', $rootPath));
        }

        foreach ($dirs as $dir) {
            $this->scanDir($dir);
        }
    }

    /**
     * Adds the files of one directory as samples; the directory name is
     * their target. A directory that cannot be listed is skipped.
     */
    private function scanDir(string $dir): void
    {
        $files = glob($dir.DIRECTORY_SEPARATOR.'*');
        if ($files === false) {
            return;
        }

        $target = basename($dir);

        foreach ($files as $file) {
            if (!is_file($file)) {
                continue;
            }

            $this->samples[] = file_get_contents($file);
            $this->targets[] = $target;
        }
    }
}

View File

@ -0,0 +1,101 @@
<?php
declare(strict_types=1);
namespace Phpml\Dataset;
use Phpml\Exception\InvalidArgumentException;
/**
* MNIST dataset: http://yann.lecun.com/exdb/mnist/
* original mnist dataset reader: https://github.com/AndrewCarterUK/mnist-neural-network-plain-php
*/
final class MnistDataset extends ArrayDataset
{
    // Magic numbers expected at the start of the image and label files.
    private const MAGIC_IMAGE = 0x00000803;

    private const MAGIC_LABEL = 0x00000801;

    // Only images of this size are accepted.
    private const IMAGE_ROWS = 28;

    private const IMAGE_COLS = 28;

    /**
     * Reads the images as samples and the labels as targets.
     *
     * @throws InvalidArgumentException when a file cannot be opened, has an
     *                                  unexpected header, or the two files
     *                                  hold a different number of entries
     */
    public function __construct(string $imagePath, string $labelPath)
    {
        $this->samples = $this->readImages($imagePath);
        $this->targets = $this->readLabels($labelPath);

        if (count($this->samples) !== count($this->targets)) {
            throw new InvalidArgumentException('Must have the same number of images and labels');
        }
    }

    /**
     * Reads an image file: a 16-byte header (magic, size, rows, cols as
     * 32-bit big-endian integers) followed by one byte per pixel.
     *
     * @return array One array of pixel values per image, each byte divided by 255
     */
    private function readImages(string $imagePath): array
    {
        $handle = fopen($imagePath, 'rb');
        if ($handle === false) {
            throw new InvalidArgumentException('Could not open file: '.$imagePath);
        }

        $images = [];

        try {
            $fields = unpack('Nmagic/Nsize/Nrows/Ncols', (string) fread($handle, 16));

            if ($fields['magic'] !== self::MAGIC_IMAGE) {
                throw new InvalidArgumentException('Invalid magic number: '.$imagePath);
            }

            if ($fields['rows'] != self::IMAGE_ROWS) {
                throw new InvalidArgumentException('Invalid number of image rows: '.$imagePath);
            }

            if ($fields['cols'] != self::IMAGE_COLS) {
                throw new InvalidArgumentException('Invalid number of image cols: '.$imagePath);
            }

            // Scales a pixel byte (0..255) to the 0..1 range.
            $scale = static function ($byte) {
                return $byte / 255;
            };

            for ($index = 0; $index < $fields['size']; ++$index) {
                $raw = fread($handle, $fields['rows'] * $fields['cols']);
                $pixels = array_values(unpack('C*', (string) $raw));
                $images[] = array_map($scale, $pixels);
            }
        } finally {
            fclose($handle);
        }

        return $images;
    }

    /**
     * Reads a label file: an 8-byte header (magic, size as 32-bit
     * big-endian integers) followed by one byte per label.
     *
     * @return array The labels as integers, in file order
     */
    private function readLabels(string $labelPath): array
    {
        $handle = fopen($labelPath, 'rb');
        if ($handle === false) {
            throw new InvalidArgumentException('Could not open file: '.$labelPath);
        }

        $raw = [];

        try {
            $fields = unpack('Nmagic/Nsize', (string) fread($handle, 8));

            if ($fields['magic'] !== self::MAGIC_LABEL) {
                throw new InvalidArgumentException('Invalid magic number: '.$labelPath);
            }

            $raw = fread($handle, $fields['size']);
        } finally {
            fclose($handle);
        }

        return array_values(unpack('C*', (string) $raw));
    }
}

131
src/Dataset/SvmDataset.php Normal file
View File

@ -0,0 +1,131 @@
<?php
declare(strict_types=1);
namespace Phpml\Dataset;
use Phpml\Exception\DatasetException;
use Phpml\Exception\FileException;
class SvmDataset extends ArrayDataset
{
    /**
     * Loads a file where each line reads
     * "<target> <index>:<value> <index>:<value> ... # comment".
     *
     * @throws FileException    when the file is missing or cannot be opened
     * @throws DatasetException when a line cannot be parsed
     */
    public function __construct(string $filePath)
    {
        [$samples, $targets] = self::readProblem($filePath);

        parent::__construct($samples, $targets);
    }

    /**
     * Reads every line of the file and pads all samples to the same width.
     *
     * @return array [samples, targets]
     */
    private static function readProblem(string $filePath): array
    {
        $handle = self::openFile($filePath);

        $samples = [];
        $targets = [];
        $maxIndex = 0;

        // A malformed line makes processLine() throw; the handle must be
        // released in that case too.
        try {
            while (false !== $line = fgets($handle)) {
                [$sample, $target, $maxIndex] = self::processLine($line, $maxIndex);
                $samples[] = $sample;
                $targets[] = $target;
            }
        } finally {
            fclose($handle);
        }

        // Earlier lines may be shorter than later ones: pad them all with
        // zeros up to the highest feature index seen in the file.
        $width = $maxIndex + 1;
        $samples = array_map(static function (array $sample) use ($width): array {
            return array_pad($sample, $width, 0);
        }, $samples);

        return [$samples, $targets];
    }

    /**
     * @return resource
     *
     * @throws FileException when the file is missing or cannot be opened
     */
    private static function openFile(string $filePath)
    {
        if (!file_exists($filePath)) {
            throw new FileException(sprintf('File "%s" missing.', basename($filePath)));
        }

        $handle = fopen($filePath, 'rb');
        if ($handle === false) {
            throw new FileException(sprintf('File "%s" can\'t be open.', basename($filePath)));
        }

        return $handle;
    }

    /**
     * Parses one line into a zero-filled sample and its target.
     *
     * @param int $maxIndex Highest feature index seen so far
     *
     * @return array [sample, target, updated highest feature index]
     */
    private static function processLine(string $line, int $maxIndex): array
    {
        $columns = self::parseLine($line);

        $target = self::parseTargetColumn($columns[0]);
        $sample = array_fill(0, $maxIndex + 1, 0);

        $n = count($columns);
        for ($i = 1; $i < $n; ++$i) {
            [$index, $value] = self::parseFeatureColumn($columns[$i]);
            if ($index > $maxIndex) {
                $maxIndex = $index;
                $sample = array_pad($sample, $maxIndex + 1, 0);
            }

            $sample[$index] = $value;
        }

        return [$sample, $target, $maxIndex];
    }

    /**
     * Strips the trailing "#" comment and trailing whitespace, then
     * splits the line on single spaces (tabs count as spaces).
     */
    private static function parseLine(string $line): array
    {
        $withoutComment = explode('#', $line, 2)[0];
        $normalized = str_replace("\t", ' ', rtrim($withoutComment));

        return explode(' ', $normalized);
    }

    /**
     * @throws DatasetException when the target is not numeric
     */
    private static function parseTargetColumn(string $column): float
    {
        if (!is_numeric($column)) {
            throw new DatasetException(sprintf('Invalid target "%s".', $column));
        }

        return (float) $column;
    }

    /**
     * Splits an "<index>:<value>" column.
     *
     * @return array [zero-based index, value]
     *
     * @throws DatasetException when the column has no ":" separator
     */
    private static function parseFeatureColumn(string $column): array
    {
        $feature = explode(':', $column, 2);
        if (count($feature) !== 2) {
            throw new DatasetException(sprintf('Invalid value "%s".', $column));
        }

        $index = self::parseFeatureIndex($feature[0]);
        $value = self::parseFeatureValue($feature[1]);

        return [$index, $value];
    }

    /**
     * Converts the one-based index of the file into a zero-based index.
     *
     * @throws DatasetException when the index is not made of digits only
     *                          or is lower than 1
     */
    private static function parseFeatureIndex(string $index): int
    {
        // ctype_digit() already rejects everything is_numeric() would
        // reject (empty string, signs, decimals, exponents).
        if (!ctype_digit($index) || (int) $index < 1) {
            throw new DatasetException(sprintf('Invalid index "%s".', $index));
        }

        return (int) $index - 1;
    }

    /**
     * @throws DatasetException when the value is not numeric
     */
    private static function parseFeatureValue(string $value): float
    {
        if (!is_numeric($value)) {
            throw new DatasetException(sprintf('Invalid value "%s".', $value));
        }

        return (float) $value;
    }
}

View File

@ -0,0 +1,94 @@
<?php
declare(strict_types=1);
namespace Phpml\DimensionReduction;
use Phpml\Math\LinearAlgebra\EigenvalueDecomposition;
use Phpml\Math\Matrix;
/**
* Class to compute eigen pairs (values & vectors) of a given matrix
* with the consideration of numFeatures or totalVariance to be preserved
*
* @author hp
*/
abstract class EigenTransformerBase
{
    /**
     * Share of the total variance to keep; only used when $numFeatures
     * is null.
     *
     * @var float
     */
    public $totalVariance = 0.9;

    /**
     * Number of features to keep; takes precedence over $totalVariance
     * when it is not null.
     *
     * @var int
     */
    public $numFeatures = null;

    /**
     * Eigenvectors kept by the last decomposition, largest eigenvalue
     * first.
     *
     * @var array
     */
    protected $eigVectors = [];

    /**
     * Eigenvalues matching $eigVectors.
     *
     * @var array
     */
    protected $eigValues = [];

    /**
     * Decomposes the matrix and keeps its leading eigen pairs.
     *
     * Pairs are taken by decreasing eigenvalue until either $numFeatures
     * pairs are kept (when $numFeatures is set) or the cumulated share of
     * the eigenvalue total reaches $totalVariance.
     */
    protected function eigenDecomposition(array $matrix): void
    {
        $decomposition = new EigenvalueDecomposition($matrix);
        $eigenvalues = $decomposition->getRealEigenvalues();
        $eigenvectors = $decomposition->getEigenvectors();

        $eigenvalueTotal = array_sum($eigenvalues);

        // Largest eigenvalue first; keys are kept so that they still
        // point at the matching eigenvector.
        arsort($eigenvalues);

        $cumulated = 0.0;
        $keptVectors = [];
        $keptValues = [];

        foreach ($eigenvalues as $index => $eigenvalue) {
            $cumulated += $eigenvalue / $eigenvalueTotal;
            $keptVectors[] = $eigenvectors[$index];
            $keptValues[] = $eigenvalue;

            $enough = $this->numFeatures !== null
                ? count($keptVectors) == $this->numFeatures
                : $cumulated >= $this->totalVariance;

            if ($enough) {
                break;
            }
        }

        $this->eigValues = $keptValues;
        $this->eigVectors = $keptVectors;
    }

    /**
     * Projects the data on the kept eigenvectors: data multiplied by the
     * transposed eigenvector matrix.
     */
    protected function reduce(array $data): array
    {
        $dataMatrix = new Matrix($data);
        $vectorMatrix = new Matrix($this->eigVectors);

        $projected = $dataMatrix->multiply($vectorMatrix->transpose());

        return $projected->toArray();
    }
}

View File

@ -0,0 +1,234 @@
<?php
declare(strict_types=1);
namespace Phpml\DimensionReduction;
use Closure;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Exception\InvalidOperationException;
use Phpml\Math\Distance\Euclidean;
use Phpml\Math\Distance\Manhattan;
use Phpml\Math\Matrix;
class KernelPCA extends PCA
{
public const KERNEL_RBF = 1;
public const KERNEL_SIGMOID = 2;
public const KERNEL_LAPLACIAN = 3;
public const KERNEL_LINEAR = 4;
/**
* Selected kernel function
*
* @var int
*/
protected $kernel;
/**
* Gamma value used by the kernel
*
* @var float|null
*/
protected $gamma;
/**
* Original dataset used to fit KernelPCA
*
* @var array
*/
protected $data = [];
/**
* Kernel principal component analysis (KernelPCA) is an extension of PCA using
* techniques of kernel methods. It is more suitable for data that involves
* vectors that are not linearly separable<br><br>
* Example: <b>$kpca = new KernelPCA(KernelPCA::KERNEL_RBF, null, 2, 15.0);</b>
* will initialize the algorithm with an RBF kernel having the gamma parameter as 15,0. <br>
* This transformation will return the same number of rows with only <i>2</i> columns.
*
* @param float $totalVariance Total variance to be preserved if numFeatures is not given
* @param int $numFeatures Number of columns to be returned
* @param float $gamma Gamma parameter is used with RBF and Sigmoid kernels
*
* @throws InvalidArgumentException
*/
public function __construct(int $kernel = self::KERNEL_RBF, ?float $totalVariance = null, ?int $numFeatures = null, ?float $gamma = null)
{
if (!in_array($kernel, [self::KERNEL_RBF, self::KERNEL_SIGMOID, self::KERNEL_LAPLACIAN, self::KERNEL_LINEAR], true)) {
throw new InvalidArgumentException('KernelPCA can be initialized with the following kernels only: Linear, RBF, Sigmoid and Laplacian');
}
parent::__construct($totalVariance, $numFeatures);
$this->kernel = $kernel;
$this->gamma = $gamma;
}
/**
* Takes a data and returns a lower dimensional version
* of this data while preserving $totalVariance or $numFeatures. <br>
* $data is an n-by-m matrix and returned array is
* n-by-k matrix where k <= m
*/
public function fit(array $data): array
{
$numRows = count($data);
$this->data = $data;
if ($this->gamma === null) {
$this->gamma = 1.0 / $numRows;
}
$matrix = $this->calculateKernelMatrix($this->data, $numRows);
$matrix = $this->centerMatrix($matrix, $numRows);
$this->eigenDecomposition($matrix);
$this->fit = true;
return Matrix::transposeArray($this->eigVectors);
}
/**
* Transforms the given sample to a lower dimensional vector by using
* the variables obtained during the last run of <code>fit</code>.
*
* @throws InvalidArgumentException
* @throws InvalidOperationException
*/
public function transform(array $sample): array
{
if (!$this->fit) {
throw new InvalidOperationException('KernelPCA has not been fitted with respect to original dataset, please run KernelPCA::fit() first');
}
if (is_array($sample[0])) {
throw new InvalidArgumentException('KernelPCA::transform() accepts only one-dimensional arrays');
}
$pairs = $this->getDistancePairs($sample);
return $this->projectSample($pairs);
}
/**
* Calculates similarity matrix by use of selected kernel function<br>
* An n-by-m matrix is given and an n-by-n matrix is returned
*/
protected function calculateKernelMatrix(array $data, int $numRows): array
{
$kernelFunc = $this->getKernel();
$matrix = [];
for ($i = 0; $i < $numRows; ++$i) {
for ($k = 0; $k < $numRows; ++$k) {
if ($i <= $k) {
$matrix[$i][$k] = $kernelFunc($data[$i], $data[$k]);
} else {
$matrix[$i][$k] = $matrix[$k][$i];
}
}
}
return $matrix;
}
/**
* Kernel matrix is centered in its original space by using the following
* conversion:
*
* K = K N.K K.N + N.K.N where N is n-by-n matrix filled with 1/n
*/
protected function centerMatrix(array $matrix, int $n): array
{
$N = array_fill(0, $n, array_fill(0, $n, 1.0 / $n));
$N = new Matrix($N, false);
$K = new Matrix($matrix, false);
// K.N (This term is repeated so we cache it once)
$K_N = $K->multiply($N);
// N.K
$N_K = $N->multiply($K);
// N.K.N
$N_K_N = $N->multiply($K_N);
return $K->subtract($N_K)
->subtract($K_N)
->add($N_K_N)
->toArray();
}
/**
 * Returns the callable kernel function selected by $this->kernel.
 *
 * Each closure takes two vectors ($x, $y) and returns their kernel value.
 * The gamma-based kernels read $this->gamma when the closure is invoked.
 *
 * @throws InvalidArgumentException If $this->kernel matches none of the KERNEL_* constants
 */
protected function getKernel(): Closure
{
    switch ($this->kernel) {
        case self::KERNEL_LINEAR:
            // k(x,y) = xT.y
            $kernelFunc = function ($x, $y) {
                return Matrix::dot($x, $y)[0];
            };

            break;

        case self::KERNEL_RBF:
            // k(x,y) = exp(-γ.d(x,y)) where d is Euclidean::sqDistance()
            $euclidean = new Euclidean();
            $kernelFunc = function ($x, $y) use ($euclidean): float {
                return exp(-$this->gamma * $euclidean->sqDistance($x, $y));
            };

            break;

        case self::KERNEL_SIGMOID:
            // k(x,y) = tanh(γ.(xT.y + c0)) where c0 = 1
            $kernelFunc = function ($x, $y): float {
                return tanh((float) $this->gamma * (Matrix::dot($x, $y)[0] + 1.0));
            };

            break;

        case self::KERNEL_LAPLACIAN:
            // k(x,y) = exp(-γ.d(x,y)) where d is Manhattan::distance()
            $manhattan = new Manhattan();
            $kernelFunc = function ($x, $y) use ($manhattan): float {
                return exp(-$this->gamma * $manhattan->distance($x, $y));
            };

            break;

        default:
            // Not reached
            throw new InvalidArgumentException(sprintf('KernelPCA initialized with invalid kernel: %d', $this->kernel));
    }

    return $kernelFunc;
}
/**
 * Evaluates the selected kernel between every row of $this->data and the
 * given sample.
 *
 * @param array $sample Sample passed as the second kernel argument
 *
 * @return array One kernel value per row of $this->data, in row order and
 *               indexed from 0
 */
protected function getDistancePairs(array $sample): array
{
    $kernel = $this->getKernel();

    // array_values() keeps the result zero-indexed regardless of the keys
    // used in $this->data.
    return array_map(
        function ($row) use ($kernel, $sample) {
            return $kernel($row, $sample);
        },
        array_values($this->data)
    );
}
/**
 * Projects the kernel values of a sample onto the eigenvectors, each of
 * which is first divided by its eigenvalue.
 *
 * @param array $pairs Kernel values as returned by getDistancePairs()
 *
 * @return array Result of Matrix::dot() of $pairs with the scaled eigenvectors
 */
protected function projectSample(array $pairs): array
{
    // Normalize eigenvectors by eig = eigVectors / eigValues
    $scaledVectors = array_map(
        function ($eigenValue, $eigenVector) {
            $scaled = (new Matrix($eigenVector, false))->divideByScalar($eigenValue);

            return $scaled->toArray()[0];
        },
        $this->eigValues,
        $this->eigVectors
    );

    // return k.dot(eig)
    return Matrix::dot($pairs, $scaledVectors);
}
}

View File

@ -0,0 +1,223 @@
<?php
declare(strict_types=1);
namespace Phpml\DimensionReduction;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Exception\InvalidOperationException;
use Phpml\Math\Matrix;
class LDA extends EigenTransformerBase
{
    /**
     * Set to true once fit() has completed.
     *
     * @var bool
     */
    public $fit = false;

    /**
     * Distinct class labels, in order of first appearance in the targets
     * passed to fit().
     *
     * @var array
     */
    public $labels = [];

    /**
     * Per-class column means, keyed by the position of the class in $labels.
     *
     * @var array
     */
    public $means = [];

    /**
     * Number of training rows per class, keyed like $means.
     *
     * @var array
     */
    public $counts = [];

    /**
     * Column means over the whole training set.
     *
     * @var float[]
     */
    public $overallMean = [];

    /**
     * Linear Discriminant Analysis (LDA) is used to reduce the dimensionality
     * of the data. Unlike Principal Component Analysis (PCA), it is a supervised
     * technique that requires the class labels in order to fit the data to a
     * lower dimensional space. <br><br>
     * The algorithm is initialized by specifying exactly one of
     * totalVariance (a value between 0.1 and 0.99)
     * or numFeatures (number of features to be preserved).
     *
     * @param float|null $totalVariance Total explained variance to be preserved
     * @param int|null   $numFeatures   Number of features to be preserved
     *
     * @throws InvalidArgumentException If an argument is out of range, or if
     *                                  both or neither of them are given
     */
    public function __construct(?float $totalVariance = null, ?int $numFeatures = null)
    {
        if ($totalVariance !== null && ($totalVariance < 0.1 || $totalVariance > 0.99)) {
            throw new InvalidArgumentException('Total variance can be a value between 0.1 and 0.99');
        }

        if ($numFeatures !== null && $numFeatures <= 0) {
            throw new InvalidArgumentException('Number of features to be preserved should be greater than 0');
        }

        // Exactly one of the two criteria must be set.
        if (($totalVariance !== null) === ($numFeatures !== null)) {
            throw new InvalidArgumentException('Either totalVariance or numFeatures should be specified in order to run the algorithm');
        }

        if ($numFeatures !== null) {
            $this->numFeatures = $numFeatures;
        }

        if ($totalVariance !== null) {
            $this->totalVariance = $totalVariance;
        }
    }

    /**
     * Trains the algorithm to transform the given data to a lower dimensional space.
     *
     * @param array $data    Training samples, one row per sample
     * @param array $classes Class label of each sample, keyed like $data
     *
     * @return array The training data passed through reduce()
     *
     * @throws InvalidArgumentException If $data is empty or a sample has no class label
     */
    public function fit(array $data, array $classes): array
    {
        if (count($data) === 0) {
            throw new InvalidArgumentException('LDA::fit() requires at least one sample');
        }

        // A sample without a label would otherwise be attributed silently to
        // the first class.
        if (count(array_diff_key($data, $classes)) > 0) {
            throw new InvalidArgumentException('LDA::fit() requires a class label for every sample');
        }

        $this->labels = $this->getLabels($classes);
        $this->means = $this->calculateMeans($data, $classes);

        $sW = $this->calculateClassVar($data, $classes);
        $sB = $this->calculateClassCov();

        // Eigen-decompose inverse(sW).sB
        $S = $sW->inverse()->multiply($sB);
        $this->eigenDecomposition($S->toArray());

        $this->fit = true;

        return $this->reduce($data);
    }

    /**
     * Transforms the given sample to a lower dimensional vector by using
     * the eigenVectors obtained in the last run of <code>fit</code>.
     *
     * @param array $sample A single sample (one-dimensional) or a list of samples
     *
     * @throws InvalidOperationException If fit() has not been run beforehand
     */
    public function transform(array $sample): array
    {
        if (!$this->fit) {
            throw new InvalidOperationException('LDA has not been fitted with respect to original dataset, please run LDA::fit() first');
        }

        // Wrap a single sample so that reduce() always receives a list of rows.
        if (!is_array($sample[0])) {
            $sample = [$sample];
        }

        return $this->reduce($sample);
    }

    /**
     * Returns unique labels in the dataset, in order of first appearance.
     *
     * Labels keep their original type. They are deliberately not routed
     * through array keys (array_count_values + array_keys): PHP casts
     * numeric-string keys such as '1' to integers, and the strict
     * array_search() lookups in this class would then fail to find them.
     */
    protected function getLabels(array $classes): array
    {
        $labels = [];
        foreach ($classes as $class) {
            if (!in_array($class, $labels, true)) {
                $labels[] = $class;
            }
        }

        return $labels;
    }

    /**
     * Calculates mean of each column for each class and returns
     * n by m matrix where n is number of labels and m is number of columns.
     *
     * Also stores the per-class row counts in $this->counts and the column
     * means of the whole dataset in $this->overallMean.
     */
    protected function calculateMeans(array $data, array $classes): array
    {
        $means = [];
        $counts = [];
        $overallMean = array_fill(0, count($data[0]), 0.0);

        // First pass: accumulate column sums per class and overall.
        foreach ($data as $index => $row) {
            $label = array_search($classes[$index], $this->labels, true);

            foreach ($row as $col => $val) {
                if (!isset($means[$label][$col])) {
                    $means[$label][$col] = 0.0;
                }

                $means[$label][$col] += $val;
                $overallMean[$col] += $val;
            }

            if (!isset($counts[$label])) {
                $counts[$label] = 0;
            }

            ++$counts[$label];
        }

        // Second pass: turn the per-class sums into means.
        foreach ($means as $index => $row) {
            foreach ($row as $col => $sum) {
                $means[$index][$col] = $sum / $counts[$index];
            }
        }

        // Calculate overall mean of the dataset for each column
        $numElements = array_sum($counts);
        $map = function ($el) use ($numElements) {
            return $el / $numElements;
        };
        $this->overallMean = array_map($map, $overallMean);
        $this->counts = $counts;

        return $means;
    }

    /**
     * Returns the within-class scatter matrix: the sum over all rows of
     * (x - m)T.(x - m), where m holds the column means of the row's class.
     * The result is an m by m matrix where m is number of columns.
     */
    protected function calculateClassVar(array $data, array $classes): Matrix
    {
        // s is an m by m zero matrix (m = number of columns)
        $s = array_fill(0, count($data[0]), array_fill(0, count($data[0]), 0));
        $sW = new Matrix($s, false);

        foreach ($data as $index => $row) {
            $label = array_search($classes[$index], $this->labels, true);
            $means = $this->means[$label];

            $row = $this->calculateVar($row, $means);

            $sW = $sW->add($row);
        }

        return $sW;
    }

    /**
     * Returns the between-class scatter matrix: the sum over all classes of
     * N.(mc - mo)T.(mc - mo), where mc holds the class means, mo the overall
     * means and N the number of rows in the class.
     * The result is an m by m matrix where m is number of columns.
     */
    protected function calculateClassCov(): Matrix
    {
        // s is an m by m zero matrix (m = number of columns)
        $s = array_fill(0, count($this->overallMean), array_fill(0, count($this->overallMean), 0));
        $sB = new Matrix($s, false);

        foreach ($this->means as $index => $classMeans) {
            $row = $this->calculateVar($classMeans, $this->overallMean);
            $N = $this->counts[$index];
            $sB = $sB->add($row->multiplyByScalar($N));
        }

        return $sB;
    }

    /**
     * Returns the result of the calculation (x - m)T.(x - m)
     */
    protected function calculateVar(array $row, array $means): Matrix
    {
        $x = new Matrix($row, false);
        $m = new Matrix($means, false);
        $diff = $x->subtract($m);

        return $diff->transpose()->multiply($diff);
    }
}

View File

@ -0,0 +1,131 @@
<?php
declare(strict_types=1);
namespace Phpml\DimensionReduction;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Exception\InvalidOperationException;
use Phpml\Math\Statistic\Covariance;
use Phpml\Math\Statistic\Mean;
class PCA extends EigenTransformerBase
{
    /**
     * Mean value of each dimension of the data given to the last fit();
     * transform() subtracts the same values from its samples.
     *
     * @var array
     */
    protected $means = [];

    /**
     * Set to true once fit() has completed.
     *
     * @var bool
     */
    protected $fit = false;

    /**
     * PCA (Principal Component Analysis) used to explain given
     * data with lower number of dimensions. This analysis transforms the
     * data to a lower dimensional version of it by conserving a proportion of total variance
     * within the data. It is a lossy data compression technique.<br>
     * Exactly one of the two arguments must be given.
     *
     * @param float|null $totalVariance Total explained variance to be preserved
     * @param int|null   $numFeatures   Number of features to be preserved
     *
     * @throws InvalidArgumentException If an argument is out of range, or if
     *                                  both or neither of them are given
     */
    public function __construct(?float $totalVariance = null, ?int $numFeatures = null)
    {
        if ($totalVariance !== null && ($totalVariance < 0.1 || $totalVariance > 0.99)) {
            throw new InvalidArgumentException('Total variance can be a value between 0.1 and 0.99');
        }

        if ($numFeatures !== null && $numFeatures <= 0) {
            throw new InvalidArgumentException('Number of features to be preserved should be greater than 0');
        }

        // Exactly one of the two criteria must be set.
        if (($totalVariance !== null) === ($numFeatures !== null)) {
            throw new InvalidArgumentException('Either totalVariance or numFeatures should be specified in order to run the algorithm');
        }

        if ($numFeatures !== null) {
            $this->numFeatures = $numFeatures;
        }

        if ($totalVariance !== null) {
            $this->totalVariance = $totalVariance;
        }
    }

    /**
     * Takes a data and returns a lower dimensional version
     * of this data while preserving $totalVariance or $numFeatures. <br>
     * $data is an n-by-m matrix and returned array is
     * n-by-k matrix where k <= m
     *
     * @throws InvalidArgumentException If $data is empty
     */
    public function fit(array $data): array
    {
        if (count($data) === 0) {
            throw new InvalidArgumentException('PCA::fit() requires at least one sample');
        }

        $n = count($data[0]);

        // normalize() only computes means when none are stored, so drop the
        // means of any previous fit. Otherwise the new data would be shifted
        // by the old dataset's means while the covariance below still
        // assumes zero-centered columns.
        $this->means = [];
        $data = $this->normalize($data, $n);

        // Columns are centered now, hence the zero means passed here.
        $covMatrix = Covariance::covarianceMatrix($data, array_fill(0, $n, 0));

        $this->eigenDecomposition($covMatrix);

        $this->fit = true;

        return $this->reduce($data);
    }

    /**
     * Transforms the given sample to a lower dimensional vector by using
     * the eigenVectors obtained in the last run of <code>fit</code>.
     *
     * @param array $sample A single sample (one-dimensional) or a list of samples
     *
     * @throws InvalidOperationException If fit() has not been run beforehand
     */
    public function transform(array $sample): array
    {
        if (!$this->fit) {
            throw new InvalidOperationException('PCA has not been fitted with respect to original dataset, please run PCA::fit() first');
        }

        // Wrap a single sample so that the code below always sees a list of rows.
        if (!is_array($sample[0])) {
            $sample = [$sample];
        }

        $sample = $this->normalize($sample, count($sample[0]));

        return $this->reduce($sample);
    }

    /**
     * Stores the arithmetic mean of each of the first $n columns of $data
     * in $this->means.
     */
    protected function calculateMeans(array $data, int $n): void
    {
        // Calculate means for each dimension
        $this->means = [];
        for ($i = 0; $i < $n; ++$i) {
            $column = array_column($data, $i);
            $this->means[] = Mean::arithmetic($column);
        }
    }

    /**
     * Normalization of the data includes subtracting mean from
     * each dimension therefore dimensions will be centered to zero.
     *
     * The means are computed from $data only when none are stored yet;
     * otherwise the stored means are reused.
     */
    protected function normalize(array $data, int $n): array
    {
        if (count($this->means) === 0) {
            $this->calculateMeans($data, $n);
        }

        // Normalize data
        foreach (array_keys($data) as $i) {
            for ($k = 0; $k < $n; ++$k) {
                $data[$i][$k] -= $this->means[$k];
            }
        }

        return $data;
    }
}

15
src/Estimator.php Normal file
View File

@ -0,0 +1,15 @@
<?php
declare(strict_types=1);
namespace Phpml;
/**
* Contract exposing a train() method that receives samples together with
* their targets and a predict() method that receives samples only.
*/
interface Estimator
{
/**
* Receives the training samples and their targets; returns nothing.
*
* NOTE(review): presumably $targets is aligned with $samples entry by
* entry - the interface itself does not enforce this; confirm against
* the implementations.
*/
public function train(array $samples, array $targets): void;
/**
* Receives samples and returns a result whose type is left to the
* implementation.
*
* @return mixed
*/
public function predict(array $samples);
}

Some files were not shown because too many files have changed in this diff Show More