implement StopWords in TokenCountVectorizer

This commit is contained in:
Arkadiusz Kondas 2016-07-06 23:22:29 +02:00
parent a2aa27adba
commit cce68997a1
2 changed files with 70 additions and 5 deletions

View File

@ -14,6 +14,11 @@ class TokenCountVectorizer implements Transformer
*/ */
private $tokenizer; private $tokenizer;
/**
* @var StopWords
*/
private $stopWords;
/** /**
* @var float * @var float
*/ */
@ -31,12 +36,15 @@ class TokenCountVectorizer implements Transformer
/** /**
* @param Tokenizer $tokenizer * @param Tokenizer $tokenizer
* @param StopWords $stopWords
* @param float $minDF * @param float $minDF
*/ */
public function __construct(Tokenizer $tokenizer, float $minDF = 0) public function __construct(Tokenizer $tokenizer, StopWords $stopWords = null, float $minDF = 0)
{ {
$this->tokenizer = $tokenizer; $this->tokenizer = $tokenizer;
$this->stopWords = $stopWords;
$this->minDF = $minDF; $this->minDF = $minDF;
$this->vocabulary = []; $this->vocabulary = [];
$this->frequencies = []; $this->frequencies = [];
} }
@ -118,6 +126,10 @@ class TokenCountVectorizer implements Transformer
*/ */
private function getTokenIndex(string $token) private function getTokenIndex(string $token)
{ {
if ($this->isStopWord($token)) {
return false;
}
return isset($this->vocabulary[$token]) ? $this->vocabulary[$token] : false; return isset($this->vocabulary[$token]) ? $this->vocabulary[$token] : false;
} }
@ -126,11 +138,25 @@ class TokenCountVectorizer implements Transformer
*/ */
private function addTokenToVocabulary(string $token) private function addTokenToVocabulary(string $token)
{ {
if ($this->isStopWord($token)) {
return;
}
if (!isset($this->vocabulary[$token])) { if (!isset($this->vocabulary[$token])) {
$this->vocabulary[$token] = count($this->vocabulary); $this->vocabulary[$token] = count($this->vocabulary);
} }
} }
/**
 * Checks a token against the configured stop-word list.
 *
 * When no StopWords instance is set on this vectorizer the answer is
 * always false; otherwise the decision is delegated to
 * StopWords::isStopWord().
 *
 * @param string $token
 *
 * @return bool
 */
private function isStopWord(string $token): bool
{
    // No stop-word list set: nothing can be a stop word.
    if (!$this->stopWords) {
        return false;
    }

    return (bool) $this->stopWords->isStopWord($token);
}
/** /**
* @param string $token * @param string $token
*/ */

View File

@ -4,12 +4,13 @@ declare (strict_types = 1);
namespace tests\Phpml\FeatureExtraction; namespace tests\Phpml\FeatureExtraction;
use Phpml\FeatureExtraction\StopWords;
use Phpml\FeatureExtraction\TokenCountVectorizer; use Phpml\FeatureExtraction\TokenCountVectorizer;
use Phpml\Tokenization\WhitespaceTokenizer; use Phpml\Tokenization\WhitespaceTokenizer;
class TokenCountVectorizerTest extends \PHPUnit_Framework_TestCase class TokenCountVectorizerTest extends \PHPUnit_Framework_TestCase
{ {
public function testTokenCountVectorizerWithWhitespaceTokenizer() public function testTransformationWithWhitespaceTokenizer()
{ {
$samples = [ $samples = [
'Lorem ipsum dolor sit amet dolor', 'Lorem ipsum dolor sit amet dolor',
@ -45,7 +46,7 @@ class TokenCountVectorizerTest extends \PHPUnit_Framework_TestCase
$this->assertEquals($tokensCounts, $samples); $this->assertEquals($tokensCounts, $samples);
} }
public function testMinimumDocumentTokenCountFrequency() public function testTransformationWithMinimumDocumentTokenCountFrequency()
{ {
// word at least in half samples // word at least in half samples
$samples = [ $samples = [
@ -70,7 +71,7 @@ class TokenCountVectorizerTest extends \PHPUnit_Framework_TestCase
[0 => 0, 1 => 1, 2 => 0, 3 => 1, 4 => 1], [0 => 0, 1 => 1, 2 => 0, 3 => 1, 4 => 1],
]; ];
$vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), 0.5); $vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), null, 0.5);
$vectorizer->fit($samples); $vectorizer->fit($samples);
$this->assertEquals($vocabulary, $vectorizer->getVocabulary()); $this->assertEquals($vocabulary, $vectorizer->getVocabulary());
@ -91,10 +92,48 @@ class TokenCountVectorizerTest extends \PHPUnit_Framework_TestCase
[0 => 1, 1 => 0, 2 => 0, 3 => 0, 4 => 0, 5 => 0, 6 => 0, 7 => 0, 8 => 0], [0 => 1, 1 => 0, 2 => 0, 3 => 0, 4 => 0, 5 => 0, 6 => 0, 7 => 0, 8 => 0],
]; ];
$vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), 1); $vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), null, 1);
$vectorizer->fit($samples); $vectorizer->fit($samples);
$vectorizer->transform($samples); $vectorizer->transform($samples);
$this->assertEquals($tokensCounts, $samples); $this->assertEquals($tokensCounts, $samples);
} }
/**
 * Fits and transforms three samples with 'dolor' and 'diam' registered as
 * stop words, and checks that neither word appears in the vocabulary nor
 * in the resulting token counts.
 */
public function testTransformationWithStopWords()
{
    $documents = [
        'Lorem ipsum dolor sit amet dolor',
        'Mauris placerat ipsum dolor',
        'Mauris diam eros fringilla diam',
    ];

    // 'dolor' and 'diam' are stop words, so they get no vocabulary index
    // and the remaining tokens are numbered consecutively from 0 to 7.
    $expectedVocabulary = [
        'Lorem',
        'ipsum',
        'sit',
        'amet',
        'Mauris',
        'placerat',
        'eros',
        'fringilla',
    ];

    // One row per sample, one column per vocabulary index.
    $expectedCounts = [
        [1, 1, 1, 1, 0, 0, 0, 0],
        [0, 1, 0, 0, 1, 1, 0, 0],
        [0, 0, 0, 0, 1, 0, 1, 1],
    ];

    $vectorizer = new TokenCountVectorizer(
        new WhitespaceTokenizer(),
        new StopWords(['dolor', 'diam'])
    );

    $vectorizer->fit($documents);
    $this->assertEquals($expectedVocabulary, $vectorizer->getVocabulary());

    // The return value of transform() is not used; the counts are read back
    // from the same variable that was passed in.
    $vectorizer->transform($documents);
    $this->assertEquals($expectedCounts, $documents);
}
} }