implement StopWords in TokenCountVectorizer

commit cce68997a1 (parent a2aa27adba)
src/Phpml/FeatureExtraction/TokenCountVectorizer.php

@@ -14,6 +14,11 @@ class TokenCountVectorizer implements Transformer
      */
     private $tokenizer;
 
+    /**
+     * @var StopWords
+     */
+    private $stopWords;
+
     /**
      * @var float
      */
@@ -31,12 +36,15 @@ class TokenCountVectorizer implements Transformer
 
     /**
      * @param Tokenizer $tokenizer
+     * @param StopWords $stopWords
      * @param float $minDF
      */
-    public function __construct(Tokenizer $tokenizer, float $minDF = 0)
+    public function __construct(Tokenizer $tokenizer, StopWords $stopWords = null, float $minDF = 0)
     {
         $this->tokenizer = $tokenizer;
+        $this->stopWords = $stopWords;
         $this->minDF = $minDF;
 
         $this->vocabulary = [];
         $this->frequencies = [];
     }
@@ -118,6 +126,10 @@ class TokenCountVectorizer implements Transformer
      */
     private function getTokenIndex(string $token)
     {
+        if ($this->isStopWord($token)) {
+            return false;
+        }
+
         return isset($this->vocabulary[$token]) ? $this->vocabulary[$token] : false;
     }
 
@@ -126,11 +138,25 @@ class TokenCountVectorizer implements Transformer
      */
     private function addTokenToVocabulary(string $token)
     {
+        if ($this->isStopWord($token)) {
+            return;
+        }
+
         if (!isset($this->vocabulary[$token])) {
             $this->vocabulary[$token] = count($this->vocabulary);
         }
     }
 
+    /**
+     * @param string $token
+     *
+     * @return bool
+     */
+    private function isStopWord(string $token): bool
+    {
+        return $this->stopWords && $this->stopWords->isStopWord($token);
+    }
+
     /**
      * @param string $token
      */
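With these changes, TokenCountVectorizer accepts an optional StopWords instance as its second constructor argument; listed tokens are skipped both when the vocabulary is built (addTokenToVocabulary) and when tokens are counted (getTokenIndex). A minimal usage sketch based on the new signature; the sample text and variable names below are illustrative, not taken from the commit:

<?php

use Phpml\FeatureExtraction\StopWords;
use Phpml\FeatureExtraction\TokenCountVectorizer;
use Phpml\Tokenization\WhitespaceTokenizer;

// Tokens listed as stop words never enter the vocabulary or the counts.
$vectorizer = new TokenCountVectorizer(
    new WhitespaceTokenizer(),
    new StopWords(['the', 'and'])
);

$samples = ['the quick fox and the lazy dog'];
$vectorizer->fit($samples);       // builds the vocabulary: quick, fox, lazy, dog
$vectorizer->transform($samples); // replaces each sample with its count vector in place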
tests/Phpml/FeatureExtraction/TokenCountVectorizerTest.php

@@ -4,12 +4,13 @@ declare (strict_types = 1);
 
 namespace tests\Phpml\FeatureExtraction;
 
+use Phpml\FeatureExtraction\StopWords;
 use Phpml\FeatureExtraction\TokenCountVectorizer;
 use Phpml\Tokenization\WhitespaceTokenizer;
 
 class TokenCountVectorizerTest extends \PHPUnit_Framework_TestCase
 {
-    public function testTokenCountVectorizerWithWhitespaceTokenizer()
+    public function testTransformationWithWhitespaceTokenizer()
     {
         $samples = [
             'Lorem ipsum dolor sit amet dolor',
@@ -45,7 +46,7 @@ class TokenCountVectorizerTest extends \PHPUnit_Framework_TestCase
         $this->assertEquals($tokensCounts, $samples);
     }
 
-    public function testMinimumDocumentTokenCountFrequency()
+    public function testTransformationWithMinimumDocumentTokenCountFrequency()
     {
         // word at least in half samples
         $samples = [
@@ -70,7 +71,7 @@ class TokenCountVectorizerTest extends \PHPUnit_Framework_TestCase
             [0 => 0, 1 => 1, 2 => 0, 3 => 1, 4 => 1],
         ];
 
-        $vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), 0.5);
+        $vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), null, 0.5);
 
         $vectorizer->fit($samples);
         $this->assertEquals($vocabulary, $vectorizer->getVocabulary());
@@ -91,10 +92,48 @@ class TokenCountVectorizerTest extends \PHPUnit_Framework_TestCase
             [0 => 1, 1 => 0, 2 => 0, 3 => 0, 4 => 0, 5 => 0, 6 => 0, 7 => 0, 8 => 0],
         ];
 
-        $vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), 1);
+        $vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), null, 1);
         $vectorizer->fit($samples);
         $vectorizer->transform($samples);
 
         $this->assertEquals($tokensCounts, $samples);
     }
 
+    public function testTransformationWithStopWords()
+    {
+        $samples = [
+            'Lorem ipsum dolor sit amet dolor',
+            'Mauris placerat ipsum dolor',
+            'Mauris diam eros fringilla diam',
+        ];
+
+        $stopWords = new StopWords(['dolor', 'diam']);
+
+        $vocabulary = [
+            0 => 'Lorem',
+            1 => 'ipsum',
+            //2 => 'dolor',
+            2 => 'sit',
+            3 => 'amet',
+            4 => 'Mauris',
+            5 => 'placerat',
+            //7 => 'diam',
+            6 => 'eros',
+            7 => 'fringilla',
+        ];
+
+        $tokensCounts = [
+            [0 => 1, 1 => 1, 2 => 1, 3 => 1, 4 => 0, 5 => 0, 6 => 0, 7 => 0],
+            [0 => 0, 1 => 1, 2 => 0, 3 => 0, 4 => 1, 5 => 1, 6 => 0, 7 => 0],
+            [0 => 0, 1 => 0, 2 => 0, 3 => 0, 4 => 1, 5 => 0, 6 => 1, 7 => 1],
+        ];
+
+        $vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), $stopWords);
+
+        $vectorizer->fit($samples);
+        $this->assertEquals($vocabulary, $vectorizer->getVocabulary());
+
+        $vectorizer->transform($samples);
+        $this->assertEquals($tokensCounts, $samples);
+    }
 }
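The StopWords class itself is not shown in this diff; the code above relies only on a constructor that takes a word list and an isStopWord() lookup. A minimal sketch of a class with that shape, written here as an assumption; the actual Phpml\FeatureExtraction\StopWords implementation may differ:

<?php

namespace Phpml\FeatureExtraction;

class StopWords
{
    /**
     * @var array
     */
    private $stopWords;

    public function __construct(array $stopWords)
    {
        // Flip the list into keys so each lookup is a hash check, not an in_array() scan.
        $this->stopWords = array_fill_keys($stopWords, true);
    }

    public function isStopWord(string $token): bool
    {
        return isset($this->stopWords[$token]);
    }
}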