php-ml/src/FeatureExtraction/TokenCountVectorizer.php

167 lines
3.8 KiB
PHP

<?php
declare(strict_types=1);
namespace Phpml\FeatureExtraction;
use Phpml\Tokenization\Tokenizer;
use Phpml\Transformer;
class TokenCountVectorizer implements Transformer
{
/**
* @var Tokenizer
*/
private $tokenizer;
/**
* @var StopWords|null
*/
private $stopWords;
/**
* @var float
*/
private $minDF;
/**
* @var array
*/
private $vocabulary = [];
/**
* @var array
*/
private $frequencies = [];
public function __construct(Tokenizer $tokenizer, ?StopWords $stopWords = null, float $minDF = 0.0)
{
$this->tokenizer = $tokenizer;
$this->stopWords = $stopWords;
$this->minDF = $minDF;
}
public function fit(array $samples, ?array $targets = null): void
{
$this->buildVocabulary($samples);
}
public function transform(array &$samples, ?array &$targets = null): void
{
array_walk($samples, function (string &$sample): void {
$this->transformSample($sample);
});
$this->checkDocumentFrequency($samples);
}
public function getVocabulary(): array
{
return array_flip($this->vocabulary);
}
private function buildVocabulary(array &$samples): void
{
foreach ($samples as $sample) {
$tokens = $this->tokenizer->tokenize($sample);
foreach ($tokens as $token) {
$this->addTokenToVocabulary($token);
}
}
}
private function transformSample(string &$sample): void
{
$counts = [];
$tokens = $this->tokenizer->tokenize($sample);
foreach ($tokens as $token) {
$index = $this->getTokenIndex($token);
if ($index !== false) {
$this->updateFrequency($token);
if (!isset($counts[$index])) {
$counts[$index] = 0;
}
++$counts[$index];
}
}
foreach ($this->vocabulary as $index) {
if (!isset($counts[$index])) {
$counts[$index] = 0;
}
}
ksort($counts);
$sample = $counts;
}
/**
* @return int|bool
*/
private function getTokenIndex(string $token)
{
if ($this->isStopWord($token)) {
return false;
}
return $this->vocabulary[$token] ?? false;
}
private function addTokenToVocabulary(string $token): void
{
if ($this->isStopWord($token)) {
return;
}
if (!isset($this->vocabulary[$token])) {
$this->vocabulary[$token] = count($this->vocabulary);
}
}
private function isStopWord(string $token): bool
{
return $this->stopWords !== null && $this->stopWords->isStopWord($token);
}
private function updateFrequency(string $token): void
{
if (!isset($this->frequencies[$token])) {
$this->frequencies[$token] = 0;
}
++$this->frequencies[$token];
}
private function checkDocumentFrequency(array &$samples): void
{
if ($this->minDF > 0) {
$beyondMinimum = $this->getBeyondMinimumIndexes(count($samples));
foreach ($samples as &$sample) {
$this->resetBeyondMinimum($sample, $beyondMinimum);
}
}
}
private function resetBeyondMinimum(array &$sample, array $beyondMinimum): void
{
foreach ($beyondMinimum as $index) {
$sample[$index] = 0;
}
}
private function getBeyondMinimumIndexes(int $samplesCount): array
{
$indexes = [];
foreach ($this->frequencies as $token => $frequency) {
if (($frequency / $samplesCount) < $this->minDF) {
$indexes[] = $this->getTokenIndex((string) $token);
}
}
return $indexes;
}
}