mirror of
https://github.com/Llewellynvdm/php-ml.git
synced 2024-11-21 20:45:10 +00:00
Add SelectKBest docs
This commit is contained in:
parent
83b1d7c9ac
commit
451f84c2e6
@ -6,7 +6,7 @@
|
||||
[![Documentation Status](https://readthedocs.org/projects/php-ml/badge/?version=master)](http://php-ml.readthedocs.org/)
|
||||
[![Total Downloads](https://poser.pugx.org/php-ai/php-ml/downloads.svg)](https://packagist.org/packages/php-ai/php-ml)
|
||||
[![License](https://poser.pugx.org/php-ai/php-ml/license.svg)](https://packagist.org/packages/php-ai/php-ml)
|
||||
[![Coverage Status](https://coveralls.io/repos/github/php-ai/php-ml/badge.svg?branch=coveralls)](https://coveralls.io/github/php-ai/php-ml?branch=coveralls)
|
||||
[![Coverage Status](https://coveralls.io/repos/github/php-ai/php-ml/badge.svg?branch=master)](https://coveralls.io/github/php-ai/php-ml?branch=master)
|
||||
[![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/php-ai/php-ml/badges/quality-score.png?b=master)](https://scrutinizer-ci.com/g/php-ai/php-ml/?branch=master)
|
||||
|
||||
<a href="http://www.yegor256.com/2016/10/23/award-2017.html">
|
||||
@ -89,6 +89,7 @@ Example scripts are available in a separate repository [php-ai/php-ml-examples](
|
||||
* [Stratified Random Split](http://php-ml.readthedocs.io/en/latest/machine-learning/cross-validation/stratified-random-split/)
|
||||
* Feature Selection
|
||||
* [Variance Threshold](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-selection/variance-threshold/)
|
||||
* [SelectKBest](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-selection/selectkbest/)
|
||||
* Preprocessing
|
||||
* [Normalization](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/normalization/)
|
||||
* [Imputation missing values](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/imputation-missing-values/)
|
||||
|
@ -6,7 +6,7 @@
|
||||
[![Documentation Status](https://readthedocs.org/projects/php-ml/badge/?version=master)](http://php-ml.readthedocs.org/)
|
||||
[![Total Downloads](https://poser.pugx.org/php-ai/php-ml/downloads.svg)](https://packagist.org/packages/php-ai/php-ml)
|
||||
[![License](https://poser.pugx.org/php-ai/php-ml/license.svg)](https://packagist.org/packages/php-ai/php-ml)
|
||||
[![Coverage Status](https://coveralls.io/repos/github/php-ai/php-ml/badge.svg?branch=coveralls)](https://coveralls.io/github/php-ai/php-ml?branch=coveralls)
|
||||
[![Coverage Status](https://coveralls.io/repos/github/php-ai/php-ml/badge.svg?branch=master)](https://coveralls.io/github/php-ai/php-ml?branch=master)
|
||||
[![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/php-ai/php-ml/badges/quality-score.png?b=master)](https://scrutinizer-ci.com/g/php-ai/php-ml/?branch=master)
|
||||
|
||||
<a href="http://www.yegor256.com/2016/10/23/award-2017.html">
|
||||
@ -78,6 +78,7 @@ Example scripts are available in a separate repository [php-ai/php-ml-examples](
|
||||
* [Stratified Random Split](machine-learning/cross-validation/stratified-random-split.md)
|
||||
* Feature Selection
|
||||
* [Variance Threshold](machine-learning/feature-selection/variance-threshold.md)
|
||||
* [SelectKBest](machine-learning/feature-selection/selectkbest.md)
|
||||
* Preprocessing
|
||||
* [Normalization](machine-learning/preprocessing/normalization.md)
|
||||
* [Imputation missing values](machine-learning/preprocessing/imputation-missing-values.md)
|
||||
|
96
docs/machine-learning/feature-selection/selectkbest.md
Normal file
96
docs/machine-learning/feature-selection/selectkbest.md
Normal file
@ -0,0 +1,96 @@
|
||||
# SelectKBest
|
||||
|
||||
`SelectKBest` - select features according to the k highest scores.
|
||||
|
||||
## Constructor Parameters
|
||||
|
||||
* $k (int) - number of top features to select; the rest will be removed (default: 10)
|
||||
* $scoringFunction (ScoringFunction) - function that takes samples and targets and returns an array with scores (default: ANOVAFValue)
|
||||
|
||||
```php
|
||||
use Phpml\FeatureSelection\SelectKBest;
|
||||
|
||||
$transformer = new SelectKBest(2);
|
||||
```
|
||||
|
||||
## Example of use
|
||||
|
||||
As an example, we can perform feature selection on the Iris dataset to retrieve only the two best features as follows:
|
||||
|
||||
```php
|
||||
use Phpml\FeatureSelection\SelectKBest;
|
||||
use Phpml\Dataset\Demo\IrisDataset;
|
||||
|
||||
$dataset = new IrisDataset();
|
||||
$selector = new SelectKBest(2);
|
||||
$selector->fit($samples = $dataset->getSamples(), $dataset->getTargets());
|
||||
$selector->transform($samples);
|
||||
|
||||
/*
|
||||
$samples[0] = [1.4, 0.2];
|
||||
*/
|
||||
```
|
||||
|
||||
## Scores
|
||||
|
||||
You can get an array with the calculated score for each feature.
|
||||
A higher value means that a given feature is better suited for learning.
|
||||
Of course, the rating depends on the scoring function used.
|
||||
|
||||
```php
|
||||
use Phpml\FeatureSelection\SelectKBest;
|
||||
use Phpml\Dataset\Demo\IrisDataset;
|
||||
|
||||
$dataset = new IrisDataset();
|
||||
$selector = new SelectKBest(2);
|
||||
$selector->fit($samples = $dataset->getSamples(), $dataset->getTargets());
|
||||
$selector->scores();
|
||||
|
||||
/*
|
||||
array(4) {
|
||||
[0]=>
|
||||
float(119.26450218451)
|
||||
[1]=>
|
||||
float(47.364461402997)
|
||||
[2]=>
|
||||
float(1179.0343277002)
|
||||
[3]=>
|
||||
float(959.32440572573)
|
||||
}
|
||||
*/
|
||||
```
|
||||
|
||||
## Scoring function
|
||||
|
||||
Available scoring functions:
|
||||
|
||||
For classification:
|
||||
- **ANOVAFValue**
|
||||
The one-way ANOVA tests the null hypothesis that 2 or more groups have the same population mean.
|
||||
The test is applied to samples from two or more groups, possibly with differing sizes.
|
||||
|
||||
For regression:
|
||||
- **UnivariateLinearRegression**
|
||||
Quick linear model for testing the effect of a single regressor, sequentially for many regressors.
|
||||
This is done in 2 steps:
|
||||
- 1. The cross correlation between each regressor and the target is computed, that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) *std(y)).
|
||||
- 2. It is converted to an F score
|
||||
|
||||
## Pipeline
|
||||
|
||||
`SelectKBest` implements the `Transformer` interface, so it can be used as part of a pipeline:
|
||||
|
||||
```php
|
||||
use Phpml\FeatureSelection\SelectKBest;
|
||||
use Phpml\Classification\SVC;
|
||||
use Phpml\FeatureExtraction\TfIdfTransformer;
|
||||
use Phpml\Pipeline;
|
||||
|
||||
$transformers = [
|
||||
new TfIdfTransformer(),
|
||||
new SelectKBest(3)
|
||||
];
|
||||
$estimator = new SVC();
|
||||
|
||||
$pipeline = new Pipeline($transformers, $estimator);
|
||||
```
|
@ -27,6 +27,7 @@ pages:
|
||||
- Stratified Random Split: machine-learning/cross-validation/stratified-random-split.md
|
||||
- Feature Selection:
|
||||
- VarianceThreshold: machine-learning/feature-selection/variance-threshold.md
|
||||
- SelectKBest: machine-learning/feature-selection/selectkbest.md
|
||||
- Preprocessing:
|
||||
- Normalization: machine-learning/preprocessing/normalization.md
|
||||
- Imputation missing values: machine-learning/preprocessing/imputation-missing-values.md
|
||||
|
@ -16,7 +16,7 @@ use Phpml\Math\Statistic\Mean;
|
||||
*
|
||||
* 1. The cross correlation between each regressor and the target is computed,
|
||||
* that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) *std(y)).
|
||||
* 2. It is converted to an F score then to a p-value.
|
||||
* 2. It is converted to an F score.
|
||||
*
|
||||
* Ported from scikit-learn f_regression function (http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html#sklearn.feature_selection.f_regression)
|
||||
*/
|
||||
|
@ -7,7 +7,6 @@ namespace Phpml\Tests;
|
||||
use Phpml\Classification\SVC;
|
||||
use Phpml\FeatureExtraction\TfIdfTransformer;
|
||||
use Phpml\FeatureExtraction\TokenCountVectorizer;
|
||||
use Phpml\FeatureSelection\ScoringFunction\ANOVAFValue;
|
||||
use Phpml\FeatureSelection\SelectKBest;
|
||||
use Phpml\ModelManager;
|
||||
use Phpml\Pipeline;
|
||||
|
Loading…
Reference in New Issue
Block a user