diff --git a/CHANGELOG.md b/CHANGELOG.md index 81ca897..bc66065 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ This changelog references the relevant changes done in PHP-ML library. * 0.1.1 (2016-07-12) * feature [Cross Validation] Stratified Random Split - equal distribution for targets in split - * feature [General] Documentation - add missing pages and fix links + * feature [General] Documentation - add missing pages (Pipeline, ConfusionMatrix and TfIdfTransformer) and fix links * 0.1.0 (2016-07-08) * first develop release diff --git a/README.md b/README.md index 9602e4d..61d215a 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,7 @@ composer require php-ai/php-ml * [Imputation missing values](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/imputation-missing-values/) * Feature Extraction * [Token Count Vectorizer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/token-count-vectorizer/) + * [Tf-idf Transformer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/tf-idf-transformer/) * Datasets * [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset/) * Ready to use: diff --git a/docs/index.md b/docs/index.md index 2d5589f..c3088e3 100644 --- a/docs/index.md +++ b/docs/index.md @@ -59,6 +59,7 @@ composer require php-ai/php-ml * [Imputation missing values](machine-learning/preprocessing/imputation-missing-values/) * Feature Extraction * [Token Count Vectorizer](machine-learning/feature-extraction/token-count-vectorizer/) + * [Tf-idf Transformer](machine-learning/feature-extraction/tf-idf-transformer/) * Datasets * [CSV](machine-learning/datasets/csv-dataset/) * Ready to use: diff --git a/docs/machine-learning/feature-extraction/tf-idf-transformer.md b/docs/machine-learning/feature-extraction/tf-idf-transformer.md new file mode 100644 index 0000000..c592b8d --- /dev/null +++ b/docs/machine-learning/feature-extraction/tf-idf-transformer.md @@ -0,0 
+1,42 @@ +# Tf-idf Transformer + +Tf–idf, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. + +### Constructor Parameters + +* $samples (array) - samples used to fit the tf-idf model + +``` +use Phpml\FeatureExtraction\TfIdfTransformer; + +$samples = [ + [1, 2, 4], + [0, 2, 1] +]; + +$transformer = new TfIdfTransformer($samples); +``` + +### Transformation + +To transform a collection of text samples, use the `transform` method. Example: + +``` +use Phpml\FeatureExtraction\TfIdfTransformer; + +$samples = [ + [0 => 1, 1 => 1, 2 => 2, 3 => 1, 4 => 0, 5 => 0], + [0 => 1, 1 => 1, 2 => 0, 3 => 0, 4 => 2, 5 => 3], +]; + +$transformer = new TfIdfTransformer($samples); +$transformer->transform($samples); + +/* +$samples = [ + [0 => 0, 1 => 0, 2 => 0.602, 3 => 0.301, 4 => 0, 5 => 0], + [0 => 0, 1 => 0, 2 => 0, 3 => 0, 4 => 0.602, 5 => 0.903], +]; +*/ + +``` diff --git a/mkdocs.yml b/mkdocs.yml index 71ad898..2634101 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -25,6 +25,7 @@ pages: - Imputation missing values: machine-learning/preprocessing/imputation-missing-values.md - Feature Extraction: - Token Count Vectorizer: machine-learning/feature-extraction/token-count-vectorizer.md + - Tf-idf Transformer: machine-learning/feature-extraction/tf-idf-transformer.md - Datasets: - Array Dataset: machine-learning/datasets/array-dataset.md - CSV Dataset: machine-learning/datasets/csv-dataset.md diff --git a/tests/Phpml/FeatureExtraction/TfIdfTransformerTest.php b/tests/Phpml/FeatureExtraction/TfIdfTransformerTest.php index eceaeb3..116b608 100644 --- a/tests/Phpml/FeatureExtraction/TfIdfTransformerTest.php +++ b/tests/Phpml/FeatureExtraction/TfIdfTransformerTest.php @@ -10,7 +10,7 @@ class TfIdfTransformerTest extends \PHPUnit_Framework_TestCase { public function testTfIdfTransformation() { - //https://en.wikipedia.org/wiki/Tf%E2%80%93idf + // https://en.wikipedia.org/wiki/Tf-idf 
$samples = [ [0 => 1, 1 => 1, 2 => 2, 3 => 1, 4 => 0, 5 => 0],