mirror of
https://github.com/Llewellynvdm/php-ml.git
synced 2025-01-24 23:58:24 +00:00
Merge pull request #9 from php-ai/develop
Add missing docs and create changelog
This commit is contained in:
commit
9d900be0c6
17
CHANGELOG.md
Normal file
17
CHANGELOG.md
Normal file
@ -0,0 +1,17 @@
|
||||
CHANGELOG
|
||||
=========
|
||||
|
||||
This changelog references the relevant changes made in the PHP-ML library.
|
||||
|
||||
* 0.2.0 (in plan)
|
||||
* feature [Dataset] - FileDataset - load dataset from files (folders as targets)
|
||||
* feature [Metric] - ClassificationReport - report about trained classifier
|
||||
|
||||
* 0.1.1 (2016-07-12)
|
||||
* feature [Cross Validation] Stratified Random Split - equal distribution for targets in split
|
||||
* feature [General] Documentation - add missing pages (Pipeline, ConfusionMatrix and TfIdfTransformer) and fix links
|
||||
|
||||
* 0.1.0 (2016-07-08)
|
||||
* first develop release
|
||||
* base tools for Machine Learning: Algorithms, Cross Validation, Preprocessing, Feature Extraction
|
||||
* bug [General] #7 - PHP-ML doesn't work on Mac
|
@ -48,6 +48,9 @@ composer require php-ai/php-ml
|
||||
* [DBSCAN](http://php-ml.readthedocs.io/en/latest/machine-learning/clustering/dbscan/)
|
||||
* Metric
|
||||
* [Accuracy](http://php-ml.readthedocs.io/en/latest/machine-learning/metric/accuracy/)
|
||||
* [Confusion Matrix](http://php-ml.readthedocs.io/en/latest/machine-learning/metric/confusion-matrix/)
|
||||
* Workflow
|
||||
* [Pipeline](http://php-ml.readthedocs.io/en/latest/machine-learning/workflow/pipeline/)
|
||||
* Cross Validation
|
||||
* [Random Split](http://php-ml.readthedocs.io/en/latest/machine-learning/cross-validation/random-split/)
|
||||
* [Stratified Random Split](http://php-ml.readthedocs.io/en/latest/machine-learning/cross-validation/stratified-random-split/)
|
||||
@ -56,6 +59,7 @@ composer require php-ai/php-ml
|
||||
* [Imputation missing values](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/imputation-missing-values/)
|
||||
* Feature Extraction
|
||||
* [Token Count Vectorizer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/token-count-vectorizer/)
|
||||
* [Tf-idf Transformer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/tf-idf-transformer/)
|
||||
* Datasets
|
||||
* [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset/)
|
||||
* Ready to use:
|
||||
|
@ -37,35 +37,39 @@ composer require php-ai/php-ml
|
||||
## Features
|
||||
|
||||
* Classification
|
||||
* [SVC](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/svc/)
|
||||
* [k-Nearest Neighbors](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/k-nearest-neighbors/)
|
||||
* [Naive Bayes](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/naive-bayes/)
|
||||
* [SVC](machine-learning/classification/svc/)
|
||||
* [k-Nearest Neighbors](machine-learning/classification/k-nearest-neighbors/)
|
||||
* [Naive Bayes](machine-learning/classification/naive-bayes/)
|
||||
* Regression
|
||||
* [Least Squares](http://php-ml.readthedocs.io/en/latest/machine-learning/regression/least-squares/)
|
||||
* [SVR](http://php-ml.readthedocs.io/en/latest/machine-learning/regression/svr/)
|
||||
* [Least Squares](machine-learning/regression/least-squares/)
|
||||
* [SVR](machine-learning/regression/svr/)
|
||||
* Clustering
|
||||
* [k-Means](http://php-ml.readthedocs.io/en/latest/machine-learning/clustering/k-means/)
|
||||
* [DBSCAN](http://php-ml.readthedocs.io/en/latest/machine-learning/clustering/dbscan/)
|
||||
* [k-Means](machine-learning/clustering/k-means/)
|
||||
* [DBSCAN](machine-learning/clustering/dbscan/)
|
||||
* Metric
|
||||
* [Accuracy](http://php-ml.readthedocs.io/en/latest/machine-learning/metric/accuracy/)
|
||||
* [Accuracy](machine-learning/metric/accuracy/)
|
||||
* [Confusion Matrix](machine-learning/metric/confusion-matrix/)
|
||||
* Workflow
|
||||
* [Pipeline](machine-learning/workflow/pipeline/)
|
||||
* Cross Validation
|
||||
* [Random Split](http://php-ml.readthedocs.io/en/latest/machine-learning/cross-validation/random-split/)
|
||||
* [Stratified Random Split](http://php-ml.readthedocs.io/en/latest/machine-learning/cross-validation/stratified-random-split/)
|
||||
* [Random Split](machine-learning/cross-validation/random-split/)
|
||||
* [Stratified Random Split](machine-learning/cross-validation/stratified-random-split/)
|
||||
* Preprocessing
|
||||
* [Normalization](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/normalization/)
|
||||
* [Imputation missing values](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/imputation-missing-values/)
|
||||
* [Normalization](machine-learning/preprocessing/normalization/)
|
||||
* [Imputation missing values](machine-learning/preprocessing/imputation-missing-values/)
|
||||
* Feature Extraction
|
||||
* [Token Count Vectorizer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/token-count-vectorizer/)
|
||||
* [Token Count Vectorizer](machine-learning/feature-extraction/token-count-vectorizer/)
|
||||
* [Tf-idf Transformer](machine-learning/feature-extraction/tf-idf-transformer/)
|
||||
* Datasets
|
||||
* [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset/)
|
||||
* [CSV](machine-learning/datasets/csv-dataset/)
|
||||
* Ready to use:
|
||||
* [Iris](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/iris/)
|
||||
* [Wine](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/wine/)
|
||||
* [Glass](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/glass/)
|
||||
* [Iris](machine-learning/datasets/demo/iris/)
|
||||
* [Wine](machine-learning/datasets/demo/wine/)
|
||||
* [Glass](machine-learning/datasets/demo/glass/)
|
||||
* Math
|
||||
* [Distance](http://php-ml.readthedocs.io/en/latest/math/distance/)
|
||||
* [Matrix](http://php-ml.readthedocs.io/en/latest/math/matrix/)
|
||||
* [Statistic](http://php-ml.readthedocs.io/en/latest/math/statistic/)
|
||||
* [Distance](math/distance/)
|
||||
* [Matrix](math/matrix/)
|
||||
* [Statistic](math/statistic/)
|
||||
|
||||
|
||||
## Contribute
|
||||
|
@ -0,0 +1,42 @@
|
||||
# Tf-idf Transformer
|
||||
|
||||
Tf–idf, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.
|
||||
|
||||
### Constructor Parameters
|
||||
|
||||
* $samples (array) - samples used to fit the tf-idf model
|
||||
|
||||
```
|
||||
use Phpml\FeatureExtraction\TfIdfTransformer;
|
||||
|
||||
$samples = [
|
||||
[1, 2, 4],
|
||||
[0, 2, 1]
|
||||
];
|
||||
|
||||
$transformer = new TfIdfTransformer($samples);
|
||||
```
|
||||
|
||||
### Transformation
|
||||
|
||||
To transform a collection of text samples, use the `transform` method. Example:
|
||||
|
||||
```
|
||||
use Phpml\FeatureExtraction\TfIdfTransformer;
|
||||
|
||||
$samples = [
|
||||
[0 => 1, 1 => 1, 2 => 2, 3 => 1, 4 => 0, 5 => 0],
|
||||
[0 => 1, 1 => 1, 2 => 0, 3 => 0, 4 => 2, 5 => 3],
|
||||
];
|
||||
|
||||
$transformer = new TfIdfTransformer($samples);
|
||||
$transformer->transform($samples);
|
||||
|
||||
/*
|
||||
$samples = [
|
||||
[0 => 0, 1 => 0, 2 => 0.602, 3 => 0.301, 4 => 0, 5 => 0],
|
||||
[0 => 0, 1 => 0, 2 => 0, 3 => 0, 4 => 0.602, 5 => 0.903],
|
||||
];
|
||||
*/
|
||||
|
||||
```
|
44
docs/machine-learning/metric/confusion-matrix.md
Normal file
44
docs/machine-learning/metric/confusion-matrix.md
Normal file
@ -0,0 +1,44 @@
|
||||
# Confusion Matrix
|
||||
|
||||
Class for computing a confusion matrix to evaluate the accuracy of a classification.
|
||||
|
||||
### Example (all targets)
|
||||
|
||||
Compute ConfusionMatrix for all targets.
|
||||
|
||||
```
|
||||
use Phpml\Metric\ConfusionMatrix;
|
||||
|
||||
$actualTargets = [2, 0, 2, 2, 0, 1];
|
||||
$predictedTargets = [0, 0, 2, 2, 0, 2];
|
||||
|
||||
$confusionMatrix = ConfusionMatrix::compute($actualTargets, $predictedTargets);
|
||||
|
||||
/*
|
||||
$confusionMatrix = [
|
||||
[2, 0, 0],
|
||||
[0, 0, 1],
|
||||
[1, 0, 2],
|
||||
];
|
||||
*/
|
||||
```
|
||||
|
||||
### Example (chosen targets)
|
||||
|
||||
Compute ConfusionMatrix for chosen targets.
|
||||
|
||||
```
|
||||
use Phpml\Metric\ConfusionMatrix;
|
||||
|
||||
$actualTargets = ['cat', 'ant', 'cat', 'cat', 'ant', 'bird'];
|
||||
$predictedTargets = ['ant', 'ant', 'cat', 'cat', 'ant', 'cat'];
|
||||
|
||||
$confusionMatrix = ConfusionMatrix::compute($actualTargets, $predictedTargets, ['ant', 'bird']);
|
||||
|
||||
/*
|
||||
$confusionMatrix = [
|
||||
[2, 0],
|
||||
[0, 0],
|
||||
];
|
||||
*/
|
||||
```
|
65
docs/machine-learning/workflow/pipeline.md
Normal file
65
docs/machine-learning/workflow/pipeline.md
Normal file
@ -0,0 +1,65 @@
|
||||
# Pipeline
|
||||
|
||||
In machine learning, it is common to run a sequence of algorithms to process and learn from a dataset. For example:
|
||||
|
||||
* Split each document’s text into tokens.
|
||||
* Convert each document’s words into a numerical feature vector ([Token Count Vectorizer](machine-learning/feature-extraction/token-count-vectorizer/)).
|
||||
* Learn a prediction model using the feature vectors and labels.
|
||||
|
||||
PHP-ML represents such a workflow as a Pipeline, which consists of a sequence of transformers and an estimator.
|
||||
|
||||
|
||||
### Constructor Parameters
|
||||
|
||||
* $transformers (array|Transformer[]) - sequence of objects that implement the Transformer interface
|
||||
* $estimator (Estimator) - estimator that can train and predict
|
||||
|
||||
```
|
||||
use Phpml\Classification\SVC;
|
||||
use Phpml\FeatureExtraction\TfIdfTransformer;
|
||||
use Phpml\Pipeline;
|
||||
|
||||
$transformers = [
|
||||
new TfIdfTransformer(),
|
||||
];
|
||||
$estimator = new SVC();
|
||||
|
||||
$pipeline = new Pipeline($transformers, $estimator);
|
||||
```
|
||||
|
||||
### Example
|
||||
|
||||
First, our pipeline replaces missing values, then normalizes the samples, and finally trains the SVC estimator. A pipeline prepared this way repeats each transformation step for every sample to be predicted.
|
||||
|
||||
```
|
||||
use Phpml\Classification\SVC;
|
||||
use Phpml\Pipeline;
|
||||
use Phpml\Preprocessing\Imputer;
|
||||
use Phpml\Preprocessing\Normalizer;
|
||||
use Phpml\Preprocessing\Imputer\Strategy\MostFrequentStrategy;
|
||||
|
||||
$transformers = [
|
||||
new Imputer(null, new MostFrequentStrategy()),
|
||||
new Normalizer(),
|
||||
];
|
||||
$estimator = new SVC();
|
||||
|
||||
$samples = [
|
||||
[1, -1, 2],
|
||||
[2, 0, null],
|
||||
[null, 1, -1],
|
||||
];
|
||||
|
||||
$targets = [
|
||||
4,
|
||||
1,
|
||||
4,
|
||||
];
|
||||
|
||||
$pipeline = new Pipeline($transformers, $estimator);
|
||||
$pipeline->train($samples, $targets);
|
||||
|
||||
$predicted = $pipeline->predict([[0, 0, 0]]);
|
||||
|
||||
// $predicted == 4
|
||||
```
|
@ -14,13 +14,18 @@ pages:
|
||||
- DBSCAN: machine-learning/clustering/dbscan.md
|
||||
- Metric:
|
||||
- Accuracy: machine-learning/metric/accuracy.md
|
||||
- Confusion Matrix: machine-learning/metric/confusion-matrix.md
|
||||
- Workflow:
|
||||
- Pipeline: machine-learning/workflow/pipeline.md
|
||||
- Cross Validation:
|
||||
- RandomSplit: machine-learning/cross-validation/random-split.md
|
||||
- Stratified Random Split: machine-learning/cross-validation/stratified-random-split.md
|
||||
- Preprocessing:
|
||||
- Normalization: machine-learning/preprocessing/normalization.md
|
||||
- Imputation missing values: machine-learning/preprocessing/imputation-missing-values.md
|
||||
- Feature Extraction:
|
||||
- Token Count Vectorizer: machine-learning/feature-extraction/token-count-vectorizer.md
|
||||
- Tf-idf Transformer: machine-learning/feature-extraction/tf-idf-transformer.md
|
||||
- Datasets:
|
||||
- Array Dataset: machine-learning/datasets/array-dataset.md
|
||||
- CSV Dataset: machine-learning/datasets/csv-dataset.md
|
||||
|
@ -10,7 +10,7 @@ class TfIdfTransformerTest extends \PHPUnit_Framework_TestCase
|
||||
{
|
||||
public function testTfIdfTransformation()
|
||||
{
|
||||
//https://en.wikipedia.org/wiki/Tf%E2%80%93idf
|
||||
// https://en.wikipedia.org/wiki/Tf-idf
|
||||
|
||||
$samples = [
|
||||
[0 => 1, 1 => 1, 2 => 2, 3 => 1, 4 => 0, 5 => 0],
|
||||
|
Loading…
x
Reference in New Issue
Block a user