diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..bc66065 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,17 @@ +CHANGELOG +========= + +This changelog references the relevant changes done in PHP-ML library. + +* 0.2.0 (in plan) + * feature [Dataset] - FileDataset - load dataset from files (folders as targets) + * feature [Metric] - ClassificationReport - report about trained classifier + +* 0.1.1 (2016-07-12) + * feature [Cross Validation] Stratified Random Split - equal distribution for targets in split + * feature [General] Documentation - add missing pages (Pipeline, ConfusionMatrix and TfIdfTransformer) and fix links + +* 0.1.0 (2016-07-08) + * first develop release + * base tools for Machine Learning: Algorithms, Cross Validation, Preprocessing, Feature Extraction + * bug [General] #7 - PHP-ML doesn't work on Mac diff --git a/README.md b/README.md index 1f7d97c..61d215a 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,9 @@ composer require php-ai/php-ml * [DBSCAN](http://php-ml.readthedocs.io/en/latest/machine-learning/clustering/dbscan/) * Metric * [Accuracy](http://php-ml.readthedocs.io/en/latest/machine-learning/metric/accuracy/) + * [Confusion Matrix](http://php-ml.readthedocs.io/en/latest/machine-learning/metric/confusion-matrix/) +* Workflow + * [Pipeline](http://php-ml.readthedocs.io/en/latest/machine-learning/workflow/pipeline) * Cross Validation * [Random Split](http://php-ml.readthedocs.io/en/latest/machine-learning/cross-validation/random-split/) * [Stratified Random Split](http://php-ml.readthedocs.io/en/latest/machine-learning/cross-validation/stratified-random-split/) @@ -56,6 +59,7 @@ composer require php-ai/php-ml * [Imputation missing values](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/imputation-missing-values/) * Feature Extraction * [Token Count Vectorizer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/token-count-vectorizer/) + * [Tf-idf 
Transformer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/tf-idf-transformer/) * Datasets * [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset/) * Ready to use: diff --git a/docs/index.md b/docs/index.md index 7943c38..c3088e3 100644 --- a/docs/index.md +++ b/docs/index.md @@ -37,35 +37,39 @@ composer require php-ai/php-ml ## Features * Classification - * [SVC](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/svc/) - * [k-Nearest Neighbors](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/k-nearest-neighbors/) - * [Naive Bayes](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/naive-bayes/) + * [SVC](machine-learning/classification/svc/) + * [k-Nearest Neighbors](machine-learning/classification/k-nearest-neighbors/) + * [Naive Bayes](machine-learning/classification/naive-bayes/) * Regression - * [Least Squares](http://php-ml.readthedocs.io/en/latest/machine-learning/regression/least-squares/) - * [SVR](http://php-ml.readthedocs.io/en/latest/machine-learning/regression/svr/) + * [Least Squares](machine-learning/regression/least-squares/) + * [SVR](machine-learning/regression/svr/) * Clustering - * [k-Means](http://php-ml.readthedocs.io/en/latest/machine-learning/clustering/k-means/) - * [DBSCAN](http://php-ml.readthedocs.io/en/latest/machine-learning/clustering/dbscan/) + * [k-Means](machine-learning/clustering/k-means/) + * [DBSCAN](machine-learning/clustering/dbscan/) * Metric - * [Accuracy](http://php-ml.readthedocs.io/en/latest/machine-learning/metric/accuracy/) + * [Accuracy](machine-learning/metric/accuracy/) + * [Confusion Matrix](machine-learning/metric/confusion-matrix/) +* Workflow + * [Pipeline](machine-learning/workflow/pipeline) * Cross Validation - * [Random Split](http://php-ml.readthedocs.io/en/latest/machine-learning/cross-validation/random-split/) - * [Stratified Random 
Split](http://php-ml.readthedocs.io/en/latest/machine-learning/cross-validation/stratified-random-split/) + * [Random Split](machine-learning/cross-validation/random-split/) + * [Stratified Random Split](machine-learning/cross-validation/stratified-random-split/) * Preprocessing - * [Normalization](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/normalization/) - * [Imputation missing values](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/imputation-missing-values/) + * [Normalization](machine-learning/preprocessing/normalization/) + * [Imputation missing values](machine-learning/preprocessing/imputation-missing-values/) * Feature Extraction - * [Token Count Vectorizer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/token-count-vectorizer/) + * [Token Count Vectorizer](machine-learning/feature-extraction/token-count-vectorizer/) + * [Tf-idf Transformer](machine-learning/feature-extraction/tf-idf-transformer/) * Datasets - * [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset/) + * [CSV](machine-learning/datasets/csv-dataset/) * Ready to use: - * [Iris](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/iris/) - * [Wine](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/wine/) - * [Glass](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/glass/) + * [Iris](machine-learning/datasets/demo/iris/) + * [Wine](machine-learning/datasets/demo/wine/) + * [Glass](machine-learning/datasets/demo/glass/) * Math - * [Distance](http://php-ml.readthedocs.io/en/latest/math/distance/) - * [Matrix](http://php-ml.readthedocs.io/en/latest/math/matrix/) - * [Statistic](http://php-ml.readthedocs.io/en/latest/math/statistic/) + * [Distance](math/distance/) + * [Matrix](math/matrix/) + * [Statistic](math/statistic/) ## Contribute diff --git a/docs/machine-learning/feature-extraction/tf-idf-transformer.md 
b/docs/machine-learning/feature-extraction/tf-idf-transformer.md new file mode 100644 index 0000000..c592b8d --- /dev/null +++ b/docs/machine-learning/feature-extraction/tf-idf-transformer.md @@ -0,0 +1,42 @@ +# Tf-idf Transformer + +Tf–idf, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. + +### Constructor Parameters + +* $samples (array) - samples used to fit the tf-idf model + +``` +use Phpml\FeatureExtraction\TfIdfTransformer; + +$samples = [ + [1, 2, 4], + [0, 2, 1] +]; + +$transformer = new TfIdfTransformer($samples); +``` + +### Transformation + +To transform a collection of text samples, use the `transform` method. Example: + +``` +use Phpml\FeatureExtraction\TfIdfTransformer; + +$samples = [ + [0 => 1, 1 => 1, 2 => 2, 3 => 1, 4 => 0, 5 => 0], + [0 => 1, 1 => 1, 2 => 0, 3 => 0, 4 => 2, 5 => 3], +]; + +$transformer = new TfIdfTransformer($samples); +$transformer->transform($samples); + +/* +$samples = [ + [0 => 0, 1 => 0, 2 => 0.602, 3 => 0.301, 4 => 0, 5 => 0], + [0 => 0, 1 => 0, 2 => 0, 3 => 0, 4 => 0.602, 5 => 0.903], +]; +*/ + +``` diff --git a/docs/machine-learning/metric/confusion-matrix.md b/docs/machine-learning/metric/confusion-matrix.md new file mode 100644 index 0000000..b07443a --- /dev/null +++ b/docs/machine-learning/metric/confusion-matrix.md @@ -0,0 +1,44 @@ +# Confusion Matrix + +Class for computing a confusion matrix to evaluate the accuracy of a classification. + +### Example (all targets) + +Compute ConfusionMatrix for all targets. + +``` +use Phpml\Metric\ConfusionMatrix; + +$actualTargets = [2, 0, 2, 2, 0, 1]; +$predictedTargets = [0, 0, 2, 2, 0, 2]; + +$confusionMatrix = ConfusionMatrix::compute($actualTargets, $predictedTargets); + +/* +$confusionMatrix = [ + [2, 0, 0], + [0, 0, 1], + [1, 0, 2], +]; +*/ +``` + +### Example (chosen targets) + +Compute ConfusionMatrix for chosen targets. 
+ +``` +use Phpml\Metric\ConfusionMatrix; + +$actualTargets = ['cat', 'ant', 'cat', 'cat', 'ant', 'bird']; +$predictedTargets = ['ant', 'ant', 'cat', 'cat', 'ant', 'cat']; + +$confusionMatrix = ConfusionMatrix::compute($actualTargets, $predictedTargets, ['ant', 'bird']); + +/* +$confusionMatrix = [ + [2, 0], + [0, 0], +]; +*/ +``` diff --git a/docs/machine-learning/workflow/pipeline.md b/docs/machine-learning/workflow/pipeline.md new file mode 100644 index 0000000..34465eb --- /dev/null +++ b/docs/machine-learning/workflow/pipeline.md @@ -0,0 +1,65 @@ +# Pipeline + +In machine learning, it is common to run a sequence of algorithms to process and learn from a dataset. For example: + + * Split each document’s text into tokens. + * Convert each document’s words into a numerical feature vector ([Token Count Vectorizer](machine-learning/feature-extraction/token-count-vectorizer/)). + * Learn a prediction model using the feature vectors and labels. + +PHP-ML represents such a workflow as a Pipeline, which consists of a sequence of transformers and an estimator. + + +### Constructor Parameters + +* $transformers (array|Transformer[]) - sequence of objects that implement the Transformer interface +* $estimator (Estimator) - estimator that can train and predict + +``` +use Phpml\Classification\SVC; +use Phpml\FeatureExtraction\TfIdfTransformer; +use Phpml\Pipeline; + +$transformers = [ + new TfIdfTransformer(), +]; +$estimator = new SVC(); + +$pipeline = new Pipeline($transformers, $estimator); +``` + +### Example + +First, our pipeline replaces missing values, then normalizes the samples, and finally trains the SVC estimator. The pipeline prepared this way repeats each transformation step for every predicted sample. 
+ +``` +use Phpml\Classification\SVC; +use Phpml\Pipeline; +use Phpml\Preprocessing\Imputer; +use Phpml\Preprocessing\Normalizer; +use Phpml\Preprocessing\Imputer\Strategy\MostFrequentStrategy; + +$transformers = [ + new Imputer(null, new MostFrequentStrategy()), + new Normalizer(), +]; +$estimator = new SVC(); + +$samples = [ + [1, -1, 2], + [2, 0, null], + [null, 1, -1], +]; + +$targets = [ + 4, + 1, + 4, +]; + +$pipeline = new Pipeline($transformers, $estimator); +$pipeline->train($samples, $targets); + +$predicted = $pipeline->predict([[0, 0, 0]]); + +// $predicted == 4 +``` diff --git a/mkdocs.yml b/mkdocs.yml index 68e8b97..2634101 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -14,13 +14,18 @@ pages: - DBSCAN: machine-learning/clustering/dbscan.md - Metric: - Accuracy: machine-learning/metric/accuracy.md + - Confusion Matrix: machine-learning/metric/confusion-matrix.md + - Workflow: + - Pipeline: machine-learning/workflow/pipeline.md - Cross Validation: - RandomSplit: machine-learning/cross-validation/random-split.md + - Stratified Random Split: machine-learning/cross-validation/stratified-random-split.md - Preprocessing: - Normalization: machine-learning/preprocessing/normalization.md - Imputation missing values: machine-learning/preprocessing/imputation-missing-values.md - Feature Extraction: - Token Count Vectorizer: machine-learning/feature-extraction/token-count-vectorizer.md + - Tf-idf Transformer: machine-learning/feature-extraction/tf-idf-transformer.md - Datasets: - Array Dataset: machine-learning/datasets/array-dataset.md - CSV Dataset: machine-learning/datasets/csv-dataset.md diff --git a/tests/Phpml/FeatureExtraction/TfIdfTransformerTest.php b/tests/Phpml/FeatureExtraction/TfIdfTransformerTest.php index eceaeb3..116b608 100644 --- a/tests/Phpml/FeatureExtraction/TfIdfTransformerTest.php +++ b/tests/Phpml/FeatureExtraction/TfIdfTransformerTest.php @@ -10,7 +10,7 @@ class TfIdfTransformerTest extends \PHPUnit_Framework_TestCase { public function 
testTfIdfTransformation() { - //https://en.wikipedia.org/wiki/Tf%E2%80%93idf + // https://en.wikipedia.org/wiki/Tf-idf $samples = [ [0 => 1, 1 => 1, 2 => 2, 3 => 1, 4 => 0, 5 => 0],