From cc50d2c9b154ec7cdf751ca3ac6f622eeee9c8ec Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Wed, 15 Jun 2016 16:04:09 +0200 Subject: [PATCH] implement TfIdf transformation --- README.md | 2 +- .../FeatureExtraction/TfIdfTransformer.php | 54 +++++++++++++++++++ .../TfIdfTransformerTest.php | 29 ++++++++++ 3 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 src/Phpml/FeatureExtraction/TfIdfTransformer.php create mode 100644 tests/Phpml/FeatureExtraction/TfIdfTransformerTest.php diff --git a/README.md b/README.md index db3c32b..c10cb7b 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ [![License](https://poser.pugx.org/php-ai/php-ml/license.svg)](https://packagist.org/packages/php-ai/php-ml) [![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/php-ai/php-ml/badges/quality-score.png?b=develop)](https://scrutinizer-ci.com/g/php-ai/php-ml/?branch=develop) -Fresh approach to Machine Learning in PHP. Note that at the moment PHP is not the best choice for machine learning but maybe this will change ... +Fresh approach to Machine Learning in PHP. Algorithms, Cross Validation, Preprocessing, Feature Extraction and much more in one library. Simple example of classification: ```php diff --git a/src/Phpml/FeatureExtraction/TfIdfTransformer.php b/src/Phpml/FeatureExtraction/TfIdfTransformer.php new file mode 100644 index 0000000..152919e --- /dev/null +++ b/src/Phpml/FeatureExtraction/TfIdfTransformer.php @@ -0,0 +1,54 @@ +countTokensFrequency($samples); + + $count = count($samples); + foreach ($this->idf as &$value) { + $value = log($count / $value, 10); + } + + foreach ($samples as &$sample) { + foreach ($sample as $index => &$feature) { + $feature = $feature * $this->idf[$index]; + } + } + + return $samples; + } + + /** + * @param array $samples + * + * @return array + */ + private function countTokensFrequency(array $samples) + { + $this->idf = array_fill_keys(array_keys($samples[0]), 0); + + foreach ($samples as $sample) { + foreach ($sample as $index => $count) { + if ($count > 0) { + ++$this->idf[$index]; + } + } + } + } +} diff --git a/tests/Phpml/FeatureExtraction/TfIdfTransformerTest.php b/tests/Phpml/FeatureExtraction/TfIdfTransformerTest.php new file mode 100644 index 0000000..59d96c0 --- /dev/null +++ b/tests/Phpml/FeatureExtraction/TfIdfTransformerTest.php @@ -0,0 +1,29 @@ + 1, 1 => 1, 2 => 2, 3 => 1, 4 => 0, 5 => 0], + [0 => 1, 1 => 1, 2 => 0, 3 => 0, 4 => 2, 5 => 3], + ]; + + $tfIdfSamples = [ + [0 => 0, 1 => 0, 2 => 0.602, 3 => 0.301, 4 => 0, 5 => 0], + [0 => 0, 1 => 0, 2 => 0, 3 => 0, 4 => 0.602, 5 => 0.903], + ]; + + $transformer = new TfIdfTransformer(); + + $this->assertEquals($tfIdfSamples, $transformer->transform($samples), '', 0.001); + } +}