implement StopWords in TokenCountVectorizer

2025-02-13 09:28:44 +00:00 · 2016-07-06 23:22:29 +02:00 · 2016-07-06 23:22:29 +02:00 · cce68997a1
commit cce68997a1
parent a2aa27adba
2 changed files with 70 additions and 5 deletions
--- a/src/Phpml/FeatureExtraction/TokenCountVectorizer.php
+++ b/src/Phpml/FeatureExtraction/TokenCountVectorizer.php
@ -14,6 +14,11 @@ class TokenCountVectorizer implements Transformer
     */
    private $tokenizer;

+    /**
+     * Optional stop-word filter; null when none was passed to the constructor.
+     *
+     * @var StopWords|null
+     */
+    private $stopWords;
+
    /**
     * @var float
     */
@ -31,12 +36,15 @@ class TokenCountVectorizer implements Transformer

    /**
+     * Stores the collaborators and starts from an empty vocabulary and frequency table.
+     *
     * @param Tokenizer $tokenizer
+     * @param StopWords|null $stopWords tokens to skip; null turns stop-word filtering off
     * @param float     $minDF
     */
-    public function __construct(Tokenizer $tokenizer, float $minDF = 0)
+    public function __construct(Tokenizer $tokenizer, StopWords $stopWords = null, float $minDF = 0)
    {
        $this->tokenizer = $tokenizer;
+        $this->stopWords = $stopWords;
        $this->minDF = $minDF;
+
        $this->vocabulary = [];
        $this->frequencies = [];
    }
@ -118,6 +126,10 @@ class TokenCountVectorizer implements Transformer
     */
    private function getTokenIndex(string $token)
    {
+        if ($this->isStopWord($token)) { // stop words never resolve to an index
+            return false;
+        }
+
        return isset($this->vocabulary[$token]) ? $this->vocabulary[$token] : false;
    }

@ -126,11 +138,25 @@ class TokenCountVectorizer implements Transformer
     */
    private function addTokenToVocabulary(string $token)
    {
+        if ($this->isStopWord($token)) { // stop words are never added by this method
+            return;
+        }
+
        if (!isset($this->vocabulary[$token])) {
            $this->vocabulary[$token] = count($this->vocabulary);
        }
    }

+    /**
+     * Tells whether the configured stop-word filter rejects the given token.
+     *
+     * Without a configured filter no token is treated as a stop word.
+     *
+     * @param string $token
+     *
+     * @return bool
+     */
+    private function isStopWord(string $token): bool
+    {
+        if (!$this->stopWords) {
+            return false;
+        }
+
+        return (bool) $this->stopWords->isStopWord($token);
+    }
+
    /**
     * @param string $token
     */
--- a/tests/Phpml/FeatureExtraction/TokenCountVectorizerTest.php
+++ b/tests/Phpml/FeatureExtraction/TokenCountVectorizerTest.php
@ -4,12 +4,13 @@ declare (strict_types = 1);

 namespace tests\Phpml\FeatureExtraction;

+use Phpml\FeatureExtraction\StopWords;
 use Phpml\FeatureExtraction\TokenCountVectorizer;
 use Phpml\Tokenization\WhitespaceTokenizer;

 class TokenCountVectorizerTest extends \PHPUnit_Framework_TestCase
 {
-    public function testTokenCountVectorizerWithWhitespaceTokenizer()
+    public function testTransformationWithWhitespaceTokenizer()
    {
        $samples = [
            'Lorem ipsum dolor sit amet dolor',
@ -45,7 +46,7 @@ class TokenCountVectorizerTest extends \PHPUnit_Framework_TestCase
        $this->assertEquals($tokensCounts, $samples);
    }

-    public function testMinimumDocumentTokenCountFrequency()
+    public function testTransformationWithMinimumDocumentTokenCountFrequency()
    {
        // word at least in half samples
        $samples = [
@ -70,7 +71,7 @@ class TokenCountVectorizerTest extends \PHPUnit_Framework_TestCase
            [0 => 0, 1 => 1, 2 => 0, 3 => 1, 4 => 1],
        ];

-        $vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), 0.5);
+        $vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), null, 0.5);

        $vectorizer->fit($samples);
        $this->assertEquals($vocabulary, $vectorizer->getVocabulary());
@ -91,10 +92,48 @@ class TokenCountVectorizerTest extends \PHPUnit_Framework_TestCase
            [0 => 1, 1 => 0, 2 => 0, 3 => 0, 4 => 0, 5 => 0, 6 => 0, 7 => 0, 8 => 0],
        ];

-        $vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), 1);
+        $vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), null, 1);
        $vectorizer->fit($samples);
        $vectorizer->transform($samples);

        $this->assertEquals($tokensCounts, $samples);
    }
+
+    public function testTransformationWithStopWords()
+    {
+        $samples = [
+            'Lorem ipsum dolor sit amet dolor',
+            'Mauris placerat ipsum dolor',
+            'Mauris diam eros fringilla diam',
+        ];
+
+        $stopWords = new StopWords(['dolor', 'diam']); // tokens expected to be ignored
+
+        $vocabulary = [
+            0 => 'Lorem',
+            1 => 'ipsum',
+            // 'dolor' is a stop word: it gets no index, so 'sit' follows 'ipsum' directly
+            2 => 'sit',
+            3 => 'amet',
+            4 => 'Mauris',
+            5 => 'placerat',
+            // 'diam' is a stop word: it gets no index, so 'eros' follows 'placerat' directly
+            6 => 'eros',
+            7 => 'fringilla',
+        ];
+
+        $tokensCounts = [ // one row per sample, keyed by the vocabulary indices above
+            [0 => 1, 1 => 1, 2 => 1, 3 => 1, 4 => 0, 5 => 0, 6 => 0, 7 => 0],
+            [0 => 0, 1 => 1, 2 => 0, 3 => 0, 4 => 1, 5 => 1, 6 => 0, 7 => 0],
+            [0 => 0, 1 => 0, 2 => 0, 3 => 0, 4 => 1, 5 => 0, 6 => 1, 7 => 1],
+        ];
+
+        $vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), $stopWords);
+
+        $vectorizer->fit($samples);
+        $this->assertEquals($vocabulary, $vectorizer->getVocabulary());
+
+        $vectorizer->transform($samples); // $samples is compared below, i.e. it is expected to be rewritten in place
+        $this->assertEquals($tokensCounts, $samples);
+    }
 }