create stop words class

This commit is contained in:
Arkadiusz Kondas 2016-06-14 11:54:04 +02:00
parent 1ac4b44ee4
commit da6d94cc46
4 changed files with 161 additions and 0 deletions

View File

@ -0,0 +1,51 @@
<?php
declare (strict_types = 1);
namespace Phpml\FeatureExtraction;
use Phpml\Exception\InvalidArgumentException;
/**
 * A set of stop words that can be queried token by token.
 *
 * The words handed to the constructor are stored as array keys, so
 * isStopWord() is a plain isset() check. Matching is exact: the token is
 * compared as given, with no case folding or trimming.
 */
class StopWords
{
    /**
     * Lookup table: each key is a stop word, each value is true.
     *
     * @var array
     */
    protected $stopWords;

    /**
     * @param array $stopWords list of words to treat as stop words
     */
    public function __construct(array $stopWords)
    {
        // Turn the list into a word => true map so membership is a key lookup.
        $this->stopWords = array_fill_keys($stopWords, true);
    }

    /**
     * Tell whether the given token is one of the configured stop words.
     *
     * @param string $token
     *
     * @return bool
     */
    public function isStopWord(string $token): bool
    {
        return isset($this->stopWords[$token]);
    }

    /**
     * Build the predefined stop word set for a language.
     *
     * The language is resolved to the class "<this namespace>\StopWords\<language>",
     * which is instantiated without arguments.
     *
     * @param string $language class name of the language list, e.g. 'English'
     *
     * @return StopWords
     *
     * @throws InvalidArgumentException when no StopWords subclass exists for the language
     */
    public static function factory($language = 'English'): StopWords
    {
        $className = __NAMESPACE__."\\StopWords\\$language";

        // Insist on a real StopWords subclass, not just any loadable class under
        // the StopWords namespace. Without this, an unrelated class would be
        // constructed first and only then rejected by the return type.
        if (!class_exists($className) || !is_subclass_of($className, self::class)) {
            throw InvalidArgumentException::invalidStopWordsLanguage($language);
        }

        return new $className();
    }
}

View File

@ -0,0 +1,33 @@
<?php
declare (strict_types = 1);
namespace Phpml\FeatureExtraction\StopWords;
use Phpml\FeatureExtraction\StopWords;
/**
 * Predefined English stop word set.
 *
 * All entries are lower-case; contractions are stored with their apostrophe.
 */
final class English extends StopWords
{
    /**
     * English stop words as a plain list. The constructor passes this list
     * to the parent constructor.
     *
     * @var array
     */
    protected $stopWords = [
        'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', 'aren\'t', 'as', 'at', 'be', 'because',
        'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can\'t', 'cannot', 'could', 'couldn\'t', 'did', 'didn\'t',
        'do', 'does', 'doesn\'t', 'doing', 'don\'t', 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn\'t', 'has',
        'hasn\'t', 'have', 'haven\'t', 'having', 'he', 'he\'d', 'he\'ll', 'he\'s', 'her', 'here', 'here\'s', 'hers', 'herself', 'him',
        'himself', 'his', 'how', 'how\'s', 'i', 'i\'d', 'i\'ll', 'i\'m', 'i\'ve', 'if', 'in', 'into', 'is', 'isn\'t', 'it', 'it\'s', 'its',
        'itself', 'let\'s', 'me', 'more', 'most', 'mustn\'t', 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or',
        'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same', 'shan\'t', 'she', 'she\'d', 'she\'ll', 'she\'s', 'should',
        'shouldn\'t', 'so', 'some', 'such', 'than', 'that', 'that\'s', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there',
        'there\'s', 'these', 'they', 'they\'d', 'they\'ll', 'they\'re', 'they\'ve', 'this', 'those', 'through', 'to', 'too', 'under',
        'until', 'up', 'very', 'was', 'wasn\'t', 'we', 'we\'d', 'we\'ll', 'we\'re', 'we\'ve', 'were', 'weren\'t', 'what', 'what\'s',
        'when', 'when\'s', 'where', 'where\'s', 'which', 'while', 'who', 'who\'s', 'whom', 'why', 'why\'s', 'with', 'won\'t', 'would',
        'wouldn\'t', 'you', 'you\'d', 'you\'ll', 'you\'re', 'you\'ve', 'your', 'yours', 'yourself', 'yourselves',
    ];

    /**
     * Hand the built-in English list to the parent constructor.
     */
    public function __construct()
    {
        parent::__construct($this->stopWords);
    }
}

View File

@ -0,0 +1,30 @@
<?php
declare (strict_types = 1);
namespace Phpml\FeatureExtraction\StopWords;
use Phpml\FeatureExtraction\StopWords;
/**
 * Predefined Polish stop word set.
 *
 * Entries are lower-case and keep their Polish diacritics.
 */
final class Polish extends StopWords
{
    /**
     * Polish stop words as a plain list, one row per initial letter.
     * The constructor passes this list to the parent constructor.
     *
     * @var array
     */
    protected $stopWords = [
        'ach', 'aj', 'albo',
        'bardzo', 'bez', 'bo', 'być',
        'ci', 'cię', 'ciebie', 'co', 'czy',
        'daleko', 'dla', 'dlaczego', 'dlatego', 'do', 'dobrze', 'dokąd', 'dość', 'dużo', 'dwa', 'dwaj', 'dwie', 'dwoje', 'dziś', 'dzisiaj',
        'gdyby', 'gdzie', 'go',
        'ich', 'ile', 'im', 'inny',
        'ja', 'ją', 'jak', 'jakby', 'jaki', 'je', 'jeden', 'jedna', 'jedno', 'jego', 'jej', 'jemu', 'jeśli', 'jest', 'jestem', 'jeżeli', 'już',
        'każdy', 'kiedy', 'kierunku', 'kto', 'ku',
        'lub',
        'ma', 'mają', 'mam', 'mi', 'mną', 'mnie', 'moi', 'mój', 'moja', 'moje', 'może', 'mu', 'my',
        'na', 'nam', 'nami', 'nas', 'nasi', 'nasz', 'nasza', 'nasze', 'natychmiast', 'nią', 'nic', 'nich',
        'nie', 'niego', 'niej', 'niemu', 'nigdy', 'nim', 'nimi', 'niż',
        'obok', 'od', 'około', 'on', 'ona', 'one', 'oni', 'ono', 'owszem',
        'po', 'pod', 'ponieważ', 'przed', 'przedtem',
        'są', 'sam', 'sama', 'się', 'skąd',
        'tak', 'taki', 'tam', 'ten', 'to', 'tobą', 'tobie', 'tu', 'tutaj', 'twoi', 'twój', 'twoja', 'twoje', 'ty',
        'wam', 'wami', 'was', 'wasi', 'wasz', 'wasza', 'wasze', 'we', 'więc', 'wszystko', 'wtedy', 'wy',
        'żaden', 'zawsze', 'że',
    ];

    /**
     * Hand the built-in Polish list to the parent constructor.
     */
    public function __construct()
    {
        $words = $this->stopWords;

        parent::__construct($words);
    }
}

View File

@ -0,0 +1,47 @@
<?php
declare (strict_types = 1);
namespace tests\Phpml\FeatureExtraction;
use Phpml\FeatureExtraction\StopWords;
/**
 * Tests for StopWords: user-supplied word lists and the language factory.
 */
class StopWordsTest extends \PHPUnit_Framework_TestCase
{
    /**
     * Words given to the constructor are reported as stop words; others are not.
     */
    public function testCustomStopWords()
    {
        $filter = new StopWords(['lorem', 'ipsum', 'dolor']);

        foreach (['lorem', 'ipsum', 'dolor'] as $listed) {
            $this->assertTrue($filter->isStopWord($listed));
        }

        foreach (['consectetur', 'adipiscing', 'amet'] as $unlisted) {
            $this->assertFalse($filter->isStopWord($unlisted));
        }
    }

    /**
     * Asking the factory for a language that has no class must fail.
     *
     * @expectedException \Phpml\Exception\InvalidArgumentException
     */
    public function testThrowExceptionOnInvalidLanguage()
    {
        StopWords::factory('Lorem');
    }

    /**
     * The English set obtained from the factory knows "again" but not "strategy".
     */
    public function testEnglishStopWords()
    {
        $english = StopWords::factory('English');

        $this->assertTrue($english->isStopWord('again'));
        $this->assertFalse($english->isStopWord('strategy'));
    }

    /**
     * The Polish set obtained from the factory knows "wam" but not "transhumanizm".
     */
    public function testPolishStopWords()
    {
        $polish = StopWords::factory('Polish');

        $this->assertTrue($polish->isStopWord('wam'));
        $this->assertFalse($polish->isStopWord('transhumanizm'));
    }
}