2016-06-15 14:04:09 +00:00
|
|
|
<?php
|
|
|
|
|
2016-11-20 21:53:17 +00:00
|
|
|
declare(strict_types=1);
|
2016-06-15 14:04:09 +00:00
|
|
|
|
|
|
|
namespace Phpml\FeatureExtraction;
|
|
|
|
|
2016-06-16 07:00:10 +00:00
|
|
|
use Phpml\Transformer;
|
|
|
|
|
2016-06-15 14:04:09 +00:00
|
|
|
/**
 * Rescales feature vectors by an inverse-document-frequency (IDF) weight.
 *
 * fit() counts, per feature index, how many samples hold a value greater than
 * zero and stores log10(number of samples / that count) as the weight for the
 * index. transform() multiplies every feature value by the stored weight of
 * its index.
 */
class TfIdfTransformer implements Transformer
{
    /**
     * IDF weight per feature index, populated by fit().
     *
     * @var array
     */
    private $idf = [];

    /**
     * @param array $samples optional samples to fit on immediately; nothing is fitted when empty
     */
    public function __construct(array $samples = [])
    {
        if (count($samples) > 0) {
            $this->fit($samples);
        }
    }

    /**
     * Computes the IDF weight of every feature index from the given samples.
     *
     * Any previously fitted weights are replaced. An empty $samples array
     * leaves the transformer with no weights.
     *
     * @param array      $samples list of feature vectors (arrays keyed by feature index)
     * @param array|null $targets not used by this transformer
     */
    public function fit(array $samples, ?array $targets = null): void
    {
        $this->countTokensFrequency($samples);

        $count = count($samples);
        foreach ($this->idf as &$value) {
            // A feature that is never positive has a document frequency of 0.
            // Dividing by it raises DivisionByZeroError on PHP 8, so give it a
            // neutral weight instead: the feature is 0 in every fitted sample.
            $value = $value > 0
                ? log((float) ($count / $value), 10.0)
                : 0.0;
        }

        // Drop the reference so later writes to $value cannot touch $this->idf.
        unset($value);
    }

    /**
     * Multiplies, in place, every feature value by the fitted weight of its index.
     *
     * @param array      $samples feature vectors to rescale; modified by reference
     * @param array|null $targets not used by this transformer
     */
    public function transform(array &$samples, ?array &$targets = null): void
    {
        foreach ($samples as &$sample) {
            foreach ($sample as $index => &$feature) {
                $feature *= $this->idf[$index];
            }

            // Break the reference before $feature is rebound to the next sample.
            unset($feature);
        }

        unset($sample);
    }

    /**
     * Stores in $this->idf, per feature index, the number of samples whose
     * value at that index is greater than zero.
     *
     * The indices are seeded from the first sample; an index that only shows
     * up in a later sample is added on first sight.
     */
    private function countTokensFrequency(array $samples): void
    {
        if ($samples === []) {
            // Nothing to count; also avoids reading a non-existent first sample.
            $this->idf = [];

            return;
        }

        // reset() yields the first sample regardless of how $samples is keyed.
        $this->idf = array_fill_keys(array_keys(reset($samples)), 0);

        foreach ($samples as $sample) {
            foreach ($sample as $index => $count) {
                if ($count > 0) {
                    $this->idf[$index] = ($this->idf[$index] ?? 0) + 1;
                }
            }
        }
    }
}
|