mirror of
https://github.com/Llewellynvdm/php-ml.git
synced 2025-01-09 16:36:34 +00:00
feature extraction tools - TokenCountVectorizer
This commit is contained in:
parent
00178cdd45
commit
c05ce8c542
163
src/Phpml/FeatureExtraction/TokenCountVectorizer.php
Normal file
163
src/Phpml/FeatureExtraction/TokenCountVectorizer.php
Normal file
@ -0,0 +1,163 @@
|
||||
<?php
|
||||
|
||||
declare (strict_types = 1);
|
||||
|
||||
namespace Phpml\FeatureExtraction;
|
||||
|
||||
use Phpml\Tokenization\Tokenizer;
|
||||
|
||||
/**
 * Converts raw text samples into sparse token-count vectors.
 *
 * Every sample is split by the injected tokenizer. Each distinct token receives
 * a numeric index in order of first appearance, and a sample is represented as
 * a map of "token index => number of occurrences in that sample". Optionally,
 * tokens that appear in too small a share of the samples are removed from the
 * resulting vectors (they still keep their slot in the vocabulary).
 */
class TokenCountVectorizer implements Vectorizer
{
    /**
     * @var Tokenizer
     */
    private $tokenizer;

    /**
     * Minimum share of samples (0..1) a token must appear in to be kept.
     * A value of 0 (the default) disables the filtering.
     *
     * @var float
     */
    private $minDF;

    /**
     * Map of token => numeric index.
     *
     * @var array
     */
    private $vocabulary;

    /**
     * Map of token => number of transformed samples that contained it.
     *
     * @var array
     */
    private $frequencies;

    /**
     * @param Tokenizer $tokenizer
     * @param float     $minDF
     */
    public function __construct(Tokenizer $tokenizer, float $minDF = 0)
    {
        $this->tokenizer = $tokenizer;
        $this->minDF = $minDF;
        $this->vocabulary = [];
        $this->frequencies = [];
    }

    /**
     * Replaces every sample with its token-count map and then applies the
     * minimum document frequency filter.
     *
     * @param array $samples list of strings to vectorize
     *
     * @return array same keys as $samples, each value a map of token index => count
     */
    public function transform(array $samples): array
    {
        foreach ($samples as $index => $sample) {
            $samples[$index] = $this->transformSample($sample);
        }

        return $this->checkDocumentFrequency($samples);
    }

    /**
     * Returns the vocabulary as a map of numeric index => token.
     *
     * Tokens filtered out by the minimum document frequency are still listed.
     *
     * @return array
     */
    public function getVocabulary()
    {
        return array_flip($this->vocabulary);
    }

    /**
     * Counts the tokens of a single sample.
     *
     * @param string $sample
     *
     * @return array map of token index => occurrences in this sample
     */
    private function transformSample(string $sample): array
    {
        $counts = [];

        foreach ($this->tokenizer->tokenize($sample) as $token) {
            $index = $this->getTokenIndex($token);

            if (!isset($counts[$index])) {
                $counts[$index] = 0;
                // First occurrence in this sample: document frequency counts
                // the sample once, no matter how often the token repeats.
                $this->updateFrequency($token);
            }

            ++$counts[$index];
        }

        return $counts;
    }

    /**
     * Returns the index of a token, registering the token in the vocabulary
     * under the next free index when it is seen for the first time.
     *
     * @param string $token
     *
     * @return int
     */
    private function getTokenIndex(string $token): int
    {
        if (!isset($this->vocabulary[$token])) {
            $this->vocabulary[$token] = count($this->vocabulary);
        }

        return $this->vocabulary[$token];
    }

    /**
     * Records that one more sample contains the given token.
     *
     * @param string $token
     */
    private function updateFrequency(string $token)
    {
        if (!isset($this->frequencies[$token])) {
            $this->frequencies[$token] = 0;
        }

        ++$this->frequencies[$token];
    }

    /**
     * Removes from every sample the tokens whose document frequency is below
     * the configured minimum.
     *
     * NOTE: frequencies accumulate over all transform() calls on this
     * instance, while the ratio is computed against the size of the current
     * batch only.
     *
     * @param array $samples
     *
     * @return array
     */
    private function checkDocumentFrequency(array $samples): array
    {
        // Filtering disabled, or nothing to filter. The empty-batch guard also
        // prevents a division by zero when frequencies exist from earlier calls.
        if (!($this->minDF > 0) || count($samples) === 0) {
            return $samples;
        }

        $beyondMinimum = $this->getBeyondMinimumIndexes(count($samples));

        foreach ($samples as $index => $sample) {
            $samples[$index] = $this->unsetBeyondMinimum($sample, $beyondMinimum);
        }

        return $samples;
    }

    /**
     * Drops the given token indexes from a single sample.
     *
     * @param array $sample
     * @param array $beyondMinimum token indexes to remove
     *
     * @return array
     */
    private function unsetBeyondMinimum(array $sample, array $beyondMinimum): array
    {
        foreach ($beyondMinimum as $index) {
            unset($sample[$index]);
        }

        return $sample;
    }

    /**
     * Collects the indexes of all tokens whose document frequency is below
     * the configured minimum.
     *
     * @param int $samplesCount number of samples in the current batch (non-zero)
     *
     * @return array
     */
    private function getBeyondMinimumIndexes(int $samplesCount): array
    {
        $indexes = [];

        foreach ($this->frequencies as $token => $frequency) {
            if (($frequency / $samplesCount) < $this->minDF) {
                // Look the index up directly: PHP turns numeric-string array
                // keys into integers, which a string-typed helper would reject
                // under strict_types. Every counted token is already present
                // in the vocabulary.
                $indexes[] = $this->vocabulary[$token];
            }
        }

        return $indexes;
    }
}
|
15
src/Phpml/FeatureExtraction/Vectorizer.php
Normal file
15
src/Phpml/FeatureExtraction/Vectorizer.php
Normal file
@ -0,0 +1,15 @@
|
||||
<?php
|
||||
|
||||
declare (strict_types = 1);
|
||||
|
||||
namespace Phpml\FeatureExtraction;
|
||||
|
||||
/**
 * Contract for feature extractors that turn a batch of samples into vectors.
 */
interface Vectorizer
{
    /**
     * Transforms the given batch of samples and returns the transformed batch.
     *
     * @param array $samples samples to transform
     *
     * @return array transformed samples
     */
    public function transform(array $samples): array;
}
|
73
tests/Phpml/FeatureExtraction/TokenCountVectorizerTest.php
Normal file
73
tests/Phpml/FeatureExtraction/TokenCountVectorizerTest.php
Normal file
@ -0,0 +1,73 @@
|
||||
<?php
|
||||
|
||||
declare (strict_types = 1);
|
||||
|
||||
namespace tests\Phpml\FeatureExtraction;
|
||||
|
||||
use Phpml\FeatureExtraction\TokenCountVectorizer;
|
||||
use Phpml\Tokenization\WhitespaceTokenizer;
|
||||
|
||||
/**
 * Exercises TokenCountVectorizer with the whitespace tokenizer: plain token
 * counting and filtering by minimum document frequency.
 */
class TokenCountVectorizerTest extends \PHPUnit_Framework_TestCase
{
    /**
     * Without a frequency threshold every token is counted per sample and
     * indexed in order of first appearance.
     */
    public function testTokenCountVectorizerWithWhitespaceTokenizer()
    {
        $documents = [
            'Lorem ipsum dolor sit amet dolor',
            'Mauris placerat ipsum dolor',
            'Mauris diam eros fringilla diam',
        ];

        $expectedCounts = [
            [0 => 1, 1 => 1, 2 => 2, 3 => 1, 4 => 1],
            [5 => 1, 6 => 1, 1 => 1, 2 => 1],
            [5 => 1, 7 => 2, 8 => 1, 9 => 1],
        ];
        $expectedVocabulary = ['Lorem', 'ipsum', 'dolor', 'sit', 'amet', 'Mauris', 'placerat', 'diam', 'eros', 'fringilla'];

        $vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer());

        $this->assertEquals($expectedCounts, $vectorizer->transform($documents));
        $this->assertEquals($expectedVocabulary, $vectorizer->getVocabulary());
    }

    /**
     * Tokens below the minimum document frequency are removed from the
     * vectors while remaining part of the vocabulary.
     */
    public function testMinimumDocumentTokenCountFrequency()
    {
        // Scenario 1: a token must occur in at least half of the samples.
        $documents = [
            'Lorem ipsum dolor sit amet',
            'Lorem ipsum sit amet',
            'ipsum sit amet',
            'ipsum sit amet',
        ];

        $expectedVocabulary = ['Lorem', 'ipsum', 'dolor', 'sit', 'amet'];
        $expectedCounts = [
            [0 => 1, 1 => 1, 3 => 1, 4 => 1],
            [0 => 1, 1 => 1, 3 => 1, 4 => 1],
            [1 => 1, 3 => 1, 4 => 1],
            [1 => 1, 3 => 1, 4 => 1],
        ];

        $halfVectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), 0.5);

        $this->assertEquals($expectedCounts, $halfVectorizer->transform($documents));
        $this->assertEquals($expectedVocabulary, $halfVectorizer->getVocabulary());

        // Scenario 2: a token must occur in every sample.
        $documents = [
            'Lorem ipsum dolor sit amet',
            'Morbi quis lacinia arcu. Sed eu sagittis Lorem',
            'Suspendisse gravida consequat eros Lorem',
        ];

        $expectedCounts = [
            [0 => 1],
            [0 => 1],
            [0 => 1],
        ];

        $allVectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), 1);

        $this->assertEquals($expectedCounts, $allVectorizer->transform($documents));
    }
}
|
Loading…
Reference in New Issue
Block a user