2016-05-03 21:28:29 +00:00
|
|
|
<?php
|
|
|
|
|
2016-11-20 21:53:17 +00:00
|
|
|
declare(strict_types=1);
|
2016-05-03 21:28:29 +00:00
|
|
|
|
|
|
|
namespace Phpml\FeatureExtraction;
|
|
|
|
|
|
|
|
use Phpml\Tokenization\Tokenizer;
|
2016-06-16 07:00:10 +00:00
|
|
|
use Phpml\Transformer;
|
2016-05-03 21:28:29 +00:00
|
|
|
|
2016-06-15 12:09:49 +00:00
|
|
|
/**
 * Converts text samples into token-count vectors over a learned vocabulary.
 *
 * fit() tokenizes the given samples and assigns every distinct token that is
 * not a stop word a zero-based column index. transform() then replaces each
 * sample string, in place, with an array that maps every vocabulary index to
 * the number of times the corresponding token occurs in that sample.
 *
 * When $minDF is greater than zero, columns whose token occurs in a smaller
 * fraction of the transformed samples than $minDF are set to 0 in every sample.
 */
class TokenCountVectorizer implements Transformer
{
    /**
     * @var Tokenizer
     */
    private $tokenizer;

    /**
     * @var StopWords|null
     */
    private $stopWords;

    /**
     * Minimum fraction of samples a token must occur in to keep its counts.
     *
     * @var float
     */
    private $minDF;

    /**
     * Map of token => column index, in order of first appearance during fit().
     *
     * @var array
     */
    private $vocabulary = [];

    /**
     * Map of token => number of samples containing it, for the current transform() call.
     *
     * @var array
     */
    private $frequencies = [];

    /**
     * @param Tokenizer      $tokenizer splits a sample string into tokens
     * @param StopWords|null $stopWords tokens to ignore; null disables stop-word filtering
     * @param float          $minDF     minimum document frequency (fraction of samples); 0.0 disables the filter
     */
    public function __construct(Tokenizer $tokenizer, ?StopWords $stopWords = null, float $minDF = 0.0)
    {
        $this->tokenizer = $tokenizer;
        $this->stopWords = $stopWords;
        $this->minDF = $minDF;
    }

    /**
     * Adds the tokens found in $samples to the vocabulary.
     *
     * Tokens already known keep their index, so repeated calls extend the
     * vocabulary rather than replace it. $targets is not used.
     */
    public function fit(array $samples, ?array $targets = null): void
    {
        $this->buildVocabulary($samples);
    }

    /**
     * Replaces every string in $samples with its token-count vector.
     *
     * Each resulting vector has one entry per vocabulary index, ordered by
     * index. $targets is not used.
     */
    public function transform(array &$samples, ?array &$targets = null): void
    {
        // Document frequencies are compared against count($samples) below, so
        // they must describe this batch only; counts left over from an earlier
        // transform() call would skew the ratio.
        $this->frequencies = [];

        array_walk($samples, function (string &$sample): void {
            $this->transformSample($sample);
        });

        $this->checkDocumentFrequency($samples);
    }

    /**
     * Returns the vocabulary as a map of column index => token.
     */
    public function getVocabulary(): array
    {
        return array_flip($this->vocabulary);
    }

    /**
     * Tokenizes every sample and registers each token in the vocabulary.
     */
    private function buildVocabulary(array $samples): void
    {
        foreach ($samples as $sample) {
            foreach ($this->tokenizer->tokenize($sample) as $token) {
                $this->addTokenToVocabulary($token);
            }
        }
    }

    /**
     * Replaces $sample (a string) with its count vector (an array).
     *
     * Also records, once per sample, every vocabulary token the sample contains
     * so that document frequencies can be evaluated afterwards.
     */
    private function transformSample(string &$sample): void
    {
        // Vocabulary indexes are handed out in ascending order, so seeding the
        // vector from them yields every column, zero-filled and already sorted.
        $counts = array_fill_keys($this->vocabulary, 0);

        foreach ($this->tokenizer->tokenize($sample) as $token) {
            $index = $this->getTokenIndex($token);
            if ($index === false) {
                continue;
            }

            // First occurrence in this sample: count the document once, no
            // matter how often the token repeats within it.
            if ($counts[$index] === 0) {
                $this->updateFrequency($token);
            }

            ++$counts[$index];
        }

        $sample = $counts;
    }

    /**
     * Looks up the column index of a token.
     *
     * @return int|bool the index, or false for stop words and unknown tokens
     */
    private function getTokenIndex(string $token)
    {
        if ($this->isStopWord($token)) {
            return false;
        }

        return $this->vocabulary[$token] ?? false;
    }

    /**
     * Assigns the next free column index to $token unless it is a stop word
     * or already part of the vocabulary.
     */
    private function addTokenToVocabulary(string $token): void
    {
        if ($this->isStopWord($token)) {
            return;
        }

        if (!isset($this->vocabulary[$token])) {
            $this->vocabulary[$token] = count($this->vocabulary);
        }
    }

    /**
     * Tells whether $token is rejected by the configured stop-word list.
     * Always false when no stop-word list was supplied.
     */
    private function isStopWord(string $token): bool
    {
        return $this->stopWords !== null && $this->stopWords->isStopWord($token);
    }

    /**
     * Increments the frequency counter of $token, creating it on first use.
     */
    private function updateFrequency(string $token): void
    {
        if (!isset($this->frequencies[$token])) {
            $this->frequencies[$token] = 0;
        }

        ++$this->frequencies[$token];
    }

    /**
     * Zeroes, in every sample, the columns of tokens whose document frequency
     * is below $minDF. Does nothing when the filter is disabled or there are
     * no samples.
     */
    private function checkDocumentFrequency(array &$samples): void
    {
        if ($this->minDF <= 0 || $samples === []) {
            return;
        }

        $beyondMinimum = $this->getBeyondMinimumIndexes(count($samples));
        if ($beyondMinimum === []) {
            return;
        }

        foreach ($samples as &$sample) {
            $this->resetBeyondMinimum($sample, $beyondMinimum);
        }
        // Break the reference so later writes to $sample cannot alter the last element.
        unset($sample);
    }

    /**
     * Sets the given column indexes of $sample to 0.
     */
    private function resetBeyondMinimum(array &$sample, array $beyondMinimum): void
    {
        foreach ($beyondMinimum as $index) {
            $sample[$index] = 0;
        }
    }

    /**
     * Collects the column indexes of tokens whose share of samples is below $minDF.
     *
     * @return int[]
     */
    private function getBeyondMinimumIndexes(int $samplesCount): array
    {
        if ($samplesCount <= 0) {
            // No samples means no ratio to compute; also avoids dividing by zero.
            return [];
        }

        $indexes = [];
        foreach ($this->frequencies as $token => $frequency) {
            if (($frequency / $samplesCount) >= $this->minDF) {
                continue;
            }

            // Numeric-looking tokens come back from the array as int keys, hence the cast.
            $index = $this->getTokenIndex((string) $token);
            if ($index !== false) {
                $indexes[] = $index;
            }
        }

        return $indexes;
    }
}
|