feature extraction tools - TokenCountVectorizer

This commit is contained in:
Arkadiusz Kondas 2016-05-03 23:28:29 +02:00
parent 00178cdd45
commit c05ce8c542
3 changed files with 251 additions and 0 deletions

View File

@ -0,0 +1,163 @@
<?php
declare (strict_types = 1);
namespace Phpml\FeatureExtraction;
use Phpml\Tokenization\Tokenizer;
/**
 * Converts text samples into sparse token-count vectors.
 *
 * Each sample is split with the injected Tokenizer; every distinct token is
 * assigned an integer index in the order it is first seen. A transformed
 * sample is an array mapping token index => number of occurrences in that
 * sample. Tokens whose document frequency falls below $minDF are removed
 * from the resulting vectors.
 */
class TokenCountVectorizer implements Vectorizer
{
    /**
     * @var Tokenizer
     */
    private $tokenizer;

    /**
     * Minimum fraction of samples a token must appear in to be kept.
     *
     * @var float
     */
    private $minDF;

    /**
     * Map of token => feature index.
     *
     * @var array
     */
    private $vocabulary;

    /**
     * Map of token => number of samples containing that token.
     *
     * @var array
     */
    private $frequencies;

    /**
     * @param Tokenizer $tokenizer splits a sample into tokens
     * @param float     $minDF     minimum document frequency (share of samples);
     *                             values <= 0 disable the filtering
     */
    public function __construct(Tokenizer $tokenizer, float $minDF = 0)
    {
        $this->tokenizer = $tokenizer;
        $this->minDF = $minDF;
        $this->vocabulary = [];
        $this->frequencies = [];
    }

    /**
     * Replaces every sample with its token-count vector and then drops the
     * tokens that do not reach the minimum document frequency.
     *
     * NOTE(review): vocabulary and frequencies are kept between calls, while
     * the frequency ratio is computed against the size of the current batch
     * only. Calling transform() more than once on the same instance therefore
     * mixes statistics of different batches - confirm whether that is intended.
     *
     * @param array $samples list of strings; keys are preserved
     *
     * @return array token index => count maps, keyed like $samples
     */
    public function transform(array $samples): array
    {
        foreach ($samples as $index => $sample) {
            $samples[$index] = $this->transformSample($sample);
        }

        return $this->checkDocumentFrequency($samples);
    }

    /**
     * Returns the vocabulary as feature index => token.
     *
     * Tokens rejected by the minimum document frequency still appear here;
     * only the vectors returned by transform() are filtered. Numeric-looking
     * tokens come back as integers because PHP casts such array keys.
     *
     * @return array
     */
    public function getVocabulary()
    {
        return array_flip($this->vocabulary);
    }

    /**
     * Counts the tokens of a single sample and records, once per sample,
     * which tokens it contains.
     *
     * @param string $sample
     *
     * @return array token index => occurrences in this sample
     */
    private function transformSample(string $sample): array
    {
        $counts = [];

        foreach ($this->tokenizer->tokenize($sample) as $token) {
            $index = $this->getTokenIndex($token);

            if (!isset($counts[$index])) {
                // First occurrence in this sample: the document frequency must
                // grow by one per sample, not by one per occurrence.
                $this->updateFrequency($token);
                $counts[$index] = 0;
            }

            ++$counts[$index];
        }

        return $counts;
    }

    /**
     * Returns the feature index of a token, registering the token in the
     * vocabulary when it has not been seen before.
     *
     * @param string $token
     *
     * @return int
     */
    private function getTokenIndex(string $token): int
    {
        if (!isset($this->vocabulary[$token])) {
            $this->vocabulary[$token] = count($this->vocabulary);
        }

        return $this->vocabulary[$token];
    }

    /**
     * Increments the number of samples known to contain the token.
     *
     * @param string $token
     */
    private function updateFrequency(string $token)
    {
        if (!isset($this->frequencies[$token])) {
            $this->frequencies[$token] = 0;
        }

        ++$this->frequencies[$token];
    }

    /**
     * Removes from every vector the tokens below the minimum document frequency.
     *
     * @param array $samples token-count vectors
     *
     * @return array
     */
    private function checkDocumentFrequency(array $samples): array
    {
        $samplesCount = count($samples);

        // Nothing to filter when the threshold is disabled; an empty batch is
        // skipped as well so the ratio below never divides by zero.
        if ($this->minDF <= 0 || $samplesCount === 0) {
            return $samples;
        }

        $beyondMinimum = $this->getBeyondMinimumIndexes($samplesCount);

        foreach ($samples as $index => $sample) {
            $samples[$index] = $this->unsetBeyondMinimum($sample, $beyondMinimum);
        }

        return $samples;
    }

    /**
     * Drops the given feature indexes from a single vector.
     *
     * @param array $sample        token index => count
     * @param array $beyondMinimum feature indexes to remove
     *
     * @return array
     */
    private function unsetBeyondMinimum(array $sample, array $beyondMinimum): array
    {
        foreach ($beyondMinimum as $index) {
            unset($sample[$index]);
        }

        return $sample;
    }

    /**
     * Collects the feature indexes of tokens whose share of samples is
     * strictly lower than the configured minimum.
     *
     * @param int $samplesCount number of samples in the current batch (> 0)
     *
     * @return array list of feature indexes
     */
    private function getBeyondMinimumIndexes(int $samplesCount): array
    {
        $indexes = [];

        foreach ($this->frequencies as $token => $documentCount) {
            if (($documentCount / $samplesCount) < $this->minDF) {
                // Read the index directly: array keys of numeric tokens are
                // integers, which the string-typed getTokenIndex() would reject
                // under strict_types. Every counted token is in the vocabulary.
                $indexes[] = $this->vocabulary[$token];
            }
        }

        return $indexes;
    }
}

View File

@ -0,0 +1,15 @@
<?php
declare (strict_types = 1);
namespace Phpml\FeatureExtraction;
/**
* Contract for feature extractors that turn an array of samples into an
* array of transformed samples.
*/
interface Vectorizer
{
/**
* Transforms the given samples.
*
* The element type of the input and the shape of the result are not fixed
* by this interface; they are defined by the implementing class.
*
* @param array $samples samples to transform
*
* @return array transformed samples
*/
public function transform(array $samples): array;
}

View File

@ -0,0 +1,73 @@
<?php
declare (strict_types = 1);
namespace tests\Phpml\FeatureExtraction;
use Phpml\FeatureExtraction\TokenCountVectorizer;
use Phpml\Tokenization\WhitespaceTokenizer;
class TokenCountVectorizerTest extends \PHPUnit_Framework_TestCase
{
    /**
     * With the default threshold every token is kept: each sample becomes a
     * map of vocabulary index => occurrences, and the vocabulary lists the
     * tokens in order of first appearance.
     */
    public function testTokenCountVectorizerWithWhitespaceTokenizer()
    {
        $documents = [
            'Lorem ipsum dolor sit amet dolor',
            'Mauris placerat ipsum dolor',
            'Mauris diam eros fringilla diam',
        ];

        $expectedCounts = [
            [0 => 1, 1 => 1, 2 => 2, 3 => 1, 4 => 1],
            [5 => 1, 6 => 1, 1 => 1, 2 => 1],
            [5 => 1, 7 => 2, 8 => 1, 9 => 1],
        ];

        $expectedVocabulary = [
            'Lorem', 'ipsum', 'dolor', 'sit', 'amet',
            'Mauris', 'placerat', 'diam', 'eros', 'fringilla',
        ];

        $vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer());
        $actualCounts = $vectorizer->transform($documents);

        $this->assertEquals($expectedCounts, $actualCounts);
        $this->assertEquals($expectedVocabulary, $vectorizer->getVocabulary());
    }

    /**
     * Tokens below the minimum document frequency are removed from the
     * vectors, while the vocabulary still contains them.
     */
    public function testMinimumDocumentTokenCountFrequency()
    {
        // Threshold 0.5: a token has to occur in at least half of the samples.
        $halfDocuments = [
            'Lorem ipsum dolor sit amet',
            'Lorem ipsum sit amet',
            'ipsum sit amet',
            'ipsum sit amet',
        ];

        $halfExpectedCounts = [
            [0 => 1, 1 => 1, 3 => 1, 4 => 1],
            [0 => 1, 1 => 1, 3 => 1, 4 => 1],
            [1 => 1, 3 => 1, 4 => 1],
            [1 => 1, 3 => 1, 4 => 1],
        ];

        $halfExpectedVocabulary = ['Lorem', 'ipsum', 'dolor', 'sit', 'amet'];

        $halfVectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), 0.5);

        $this->assertEquals($halfExpectedCounts, $halfVectorizer->transform($halfDocuments));
        $this->assertEquals($halfExpectedVocabulary, $halfVectorizer->getVocabulary());

        // Threshold 1: a token has to occur in every sample.
        $allDocuments = [
            'Lorem ipsum dolor sit amet',
            'Morbi quis lacinia arcu. Sed eu sagittis Lorem',
            'Suspendisse gravida consequat eros Lorem',
        ];

        $allExpectedCounts = [
            [0 => 1],
            [0 => 1],
            [0 => 1],
        ];

        $allVectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), 1);

        $this->assertEquals($allExpectedCounts, $allVectorizer->transform($allDocuments));
    }
}
}