mirror of
https://github.com/Llewellynvdm/php-ml.git
synced 2025-01-09 16:36:34 +00:00
implement TfIdf transformation
This commit is contained in:
parent
8a65026642
commit
cc50d2c9b1
@ -6,7 +6,7 @@
|
|||||||
[![License](https://poser.pugx.org/php-ai/php-ml/license.svg)](https://packagist.org/packages/php-ai/php-ml)
|
[![License](https://poser.pugx.org/php-ai/php-ml/license.svg)](https://packagist.org/packages/php-ai/php-ml)
|
||||||
[![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/php-ai/php-ml/badges/quality-score.png?b=develop)](https://scrutinizer-ci.com/g/php-ai/php-ml/?branch=develop)
|
[![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/php-ai/php-ml/badges/quality-score.png?b=develop)](https://scrutinizer-ci.com/g/php-ai/php-ml/?branch=develop)
|
||||||
|
|
||||||
Fresh approach to Machine Learning in PHP. Note that at the moment PHP is not the best choice for machine learning but maybe this will change ...
|
Fresh approach to Machine Learning in PHP. Algorithms, Cross Validation, Preprocessing, Feature Extraction and much more in one library.
|
||||||
|
|
||||||
Simple example of classification:
|
Simple example of classification:
|
||||||
```php
|
```php
|
||||||
|
54
src/Phpml/FeatureExtraction/TfIdfTransformer.php
Normal file
54
src/Phpml/FeatureExtraction/TfIdfTransformer.php
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
declare (strict_types = 1);
|
||||||
|
|
||||||
|
namespace Phpml\FeatureExtraction;
|
||||||
|
|
||||||
|
class TfIdfTransformer implements Transformer
{
    /**
     * Per-feature weights, keyed by feature index. Rebuilt on every call to
     * transform(): first filled with document frequencies, then overwritten
     * with the inverse document frequency of each feature.
     *
     * @var array
     */
    private $idf = [];

    /**
     * Multiplies every feature value by the inverse document frequency
     * (base-10 logarithm of "number of samples / number of samples in which
     * the feature is positive") of its feature index.
     *
     * A feature that is not positive in any sample gets a weight of 0.0
     * instead of triggering a division by zero. An empty input yields an
     * empty result.
     *
     * @param array $samples list of samples; each sample maps feature index => value
     *
     * @return array the samples with every value re-weighted; keys are preserved
     */
    public function transform(array $samples): array
    {
        if ($samples === []) {
            $this->idf = [];

            return [];
        }

        $this->countTokensFrequency($samples);

        $total = count($samples);
        foreach ($this->idf as $index => $documentFrequency) {
            // Guard against features that never occur: $total / 0 is an error.
            $this->idf[$index] = $documentFrequency > 0
                ? log($total / $documentFrequency, 10)
                : 0.0;
        }

        // Write back through explicit keys rather than by-reference iteration,
        // so no dangling references end up inside the returned array.
        foreach ($samples as $sampleKey => $sample) {
            foreach ($sample as $index => $feature) {
                $samples[$sampleKey][$index] = $feature * $this->idf[$index];
            }
        }

        return $samples;
    }

    /**
     * Fills $this->idf with, for each feature index seen in any sample, the
     * number of samples in which that feature's value is greater than zero.
     *
     * @param array $samples list of samples; each sample maps feature index => value
     */
    private function countTokensFrequency(array $samples)
    {
        $this->idf = [];

        foreach ($samples as $sample) {
            foreach ($sample as $index => $count) {
                // Register every index, even when it is only present in a
                // later sample, so transform() always finds a weight for it.
                if (!isset($this->idf[$index])) {
                    $this->idf[$index] = 0;
                }

                if ($count > 0) {
                    ++$this->idf[$index];
                }
            }
        }
    }
}
|
29
tests/Phpml/FeatureExtraction/TfIdfTransformerTest.php
Normal file
29
tests/Phpml/FeatureExtraction/TfIdfTransformerTest.php
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
declare (strict_types = 1);
|
||||||
|
|
||||||
|
namespace tests\Phpml\FeatureExtraction;
|
||||||
|
|
||||||
|
use Phpml\FeatureExtraction\TfIdfTransformer;
|
||||||
|
|
||||||
|
class TfIdfTransformerTest extends \PHPUnit_Framework_TestCase
{
    /**
     * Feeds two term-count vectors through the transformer and compares the
     * result with the expected weights, allowing a delta of 0.001.
     */
    public function testTfIdfTransformation()
    {
        //https://en.wikipedia.org/wiki/Tf%E2%80%93idf

        // Term counts per document, indexed 0..5.
        $termCounts = [
            [1, 1, 2, 1, 0, 0],
            [1, 1, 0, 0, 2, 3],
        ];

        // Expected weights for the same documents and indices.
        $expectedWeights = [
            [0, 0, 0.602, 0.301, 0, 0],
            [0, 0, 0, 0, 0.602, 0.903],
        ];

        $actualWeights = (new TfIdfTransformer())->transform($termCounts);

        self::assertEquals($expectedWeights, $actualWeights, '', 0.001);
    }
}
|
Loading…
Reference in New Issue
Block a user