mirror of
https://github.com/Llewellynvdm/php-ml.git
synced 2025-01-09 00:20:53 +00:00
create stop words class
This commit is contained in:
parent
1ac4b44ee4
commit
da6d94cc46
51
src/Phpml/FeatureExtraction/StopWords.php
Normal file
51
src/Phpml/FeatureExtraction/StopWords.php
Normal file
@ -0,0 +1,51 @@
|
||||
<?php
|
||||
|
||||
declare (strict_types = 1);
|
||||
|
||||
namespace Phpml\FeatureExtraction;
|
||||
|
||||
use Phpml\Exception\InvalidArgumentException;
|
||||
|
||||
class StopWords
{
    /**
     * Lookup table of stop words: every word is stored as an array key
     * whose value is true.
     *
     * @var array
     */
    protected $stopWords;

    /**
     * Builds the lookup table from a plain list of words.
     *
     * @param array $stopWords list of words to treat as stop words
     */
    public function __construct(array $stopWords)
    {
        // Turn the list into word => true pairs so that membership checks
        // are key lookups rather than scans over the list.
        $lookup = array_fill_keys($stopWords, true);

        $this->stopWords = $lookup;
    }

    /**
     * Tells whether the given token is one of the configured stop words.
     *
     * The token is looked up exactly as given; no case folding or trimming
     * is applied here.
     *
     * @param string $token
     *
     * @return bool
     */
    public function isStopWord(string $token): bool
    {
        return isset($this->stopWords[$token]);
    }

    /**
     * Creates the stop word list whose class is named after the language.
     *
     * The class name is resolved as "<this namespace>\StopWords\<language>".
     *
     * @param string $language class name of the language list, e.g. 'English'
     *
     * @return StopWords
     *
     * @throws InvalidArgumentException when no class exists for the language
     */
    public static function factory($language = 'English'): StopWords
    {
        $className = __NAMESPACE__.'\\StopWords\\'.$language;

        if (class_exists($className)) {
            return new $className();
        }

        throw InvalidArgumentException::invalidStopWordsLanguage($language);
    }
}
|
33
src/Phpml/FeatureExtraction/StopWords/English.php
Normal file
33
src/Phpml/FeatureExtraction/StopWords/English.php
Normal file
@ -0,0 +1,33 @@
|
||||
<?php
|
||||
|
||||
declare (strict_types = 1);
|
||||
|
||||
namespace Phpml\FeatureExtraction\StopWords;
|
||||
|
||||
use Phpml\FeatureExtraction\StopWords;
|
||||
|
||||
/**
 * English stop word list.
 */
final class English extends StopWords
{
    /**
     * English stop words as a plain list; the constructor passes this list
     * to the parent constructor.
     *
     * @var array
     */
    protected $stopWords = [
        'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', 'aren\'t', 'as', 'at', 'be', 'because',
        'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can\'t', 'cannot', 'could', 'couldn\'t', 'did', 'didn\'t',
        'do', 'does', 'doesn\'t', 'doing', 'don\'t', 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn\'t', 'has',
        'hasn\'t', 'have', 'haven\'t', 'having', 'he', 'he\'d', 'he\'ll', 'he\'s', 'her', 'here', 'here\'s', 'hers', 'herself', 'him',
        'himself', 'his', 'how', 'how\'s', 'i', 'i\'d', 'i\'ll', 'i\'m', 'i\'ve', 'if', 'in', 'into', 'is', 'isn\'t', 'it', 'it\'s', 'its',
        'itself', 'let\'s', 'me', 'more', 'most', 'mustn\'t', 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or',
        // 'ours' and 'ourselves' are two separate entries; fused into one
        // string, neither word would be matched.
        'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same', 'shan\'t', 'she', 'she\'d', 'she\'ll', 'she\'s',
        'should', 'shouldn\'t', 'so', 'some', 'such', 'than', 'that', 'that\'s', 'the', 'their', 'theirs', 'them', 'themselves', 'then',
        'there', 'there\'s', 'these', 'they', 'they\'d', 'they\'ll', 'they\'re', 'they\'ve', 'this', 'those', 'through', 'to', 'too',
        'under', 'until', 'up', 'very', 'was', 'wasn\'t', 'we', 'we\'d', 'we\'ll', 'we\'re', 'we\'ve', 'were', 'weren\'t', 'what',
        'what\'s', 'when', 'when\'s', 'where', 'where\'s', 'which', 'while', 'who', 'who\'s', 'whom', 'why', 'why\'s', 'with', 'won\'t',
        'would', 'wouldn\'t', 'you', 'you\'d', 'you\'ll', 'you\'re', 'you\'ve', 'your', 'yours', 'yourself', 'yourselves',
    ];

    /**
     * Passes the English word list to the parent StopWords constructor.
     */
    public function __construct()
    {
        parent::__construct($this->stopWords);
    }
}
|
30
src/Phpml/FeatureExtraction/StopWords/Polish.php
Normal file
30
src/Phpml/FeatureExtraction/StopWords/Polish.php
Normal file
@ -0,0 +1,30 @@
|
||||
<?php
|
||||
|
||||
declare (strict_types = 1);
|
||||
|
||||
namespace Phpml\FeatureExtraction\StopWords;
|
||||
|
||||
use Phpml\FeatureExtraction\StopWords;
|
||||
|
||||
/**
 * Polish stop word list.
 */
final class Polish extends StopWords
{
    /**
     * Polish stop words as a plain list, one group of lines per initial
     * letter; the constructor passes this list to the parent constructor.
     *
     * @var array
     */
    protected $stopWords = [
        'ach', 'aj', 'albo',
        'bardzo', 'bez', 'bo', 'być',
        'ci', 'cię', 'ciebie', 'co', 'czy',
        'daleko', 'dla', 'dlaczego', 'dlatego', 'do', 'dobrze', 'dokąd', 'dość', 'dużo',
        'dwa', 'dwaj', 'dwie', 'dwoje', 'dziś', 'dzisiaj',
        'gdyby', 'gdzie', 'go',
        'ich', 'ile', 'im', 'inny',
        'ja', 'ją', 'jak', 'jakby', 'jaki', 'je', 'jeden', 'jedna', 'jedno', 'jego', 'jej', 'jemu',
        'jeśli', 'jest', 'jestem', 'jeżeli', 'już',
        'każdy', 'kiedy', 'kierunku', 'kto', 'ku',
        'lub',
        'ma', 'mają', 'mam', 'mi', 'mną', 'mnie', 'moi', 'mój', 'moja', 'moje', 'może', 'mu', 'my',
        'na', 'nam', 'nami', 'nas', 'nasi', 'nasz', 'nasza', 'nasze', 'natychmiast', 'nią', 'nic', 'nich',
        'nie', 'niego', 'niej', 'niemu', 'nigdy', 'nim', 'nimi', 'niż',
        'obok', 'od', 'około', 'on', 'ona', 'one', 'oni', 'ono', 'owszem',
        'po', 'pod', 'ponieważ', 'przed', 'przedtem',
        'są', 'sam', 'sama', 'się', 'skąd',
        'tak', 'taki', 'tam', 'ten', 'to', 'tobą', 'tobie', 'tu', 'tutaj',
        'twoi', 'twój', 'twoja', 'twoje', 'ty',
        'wam', 'wami', 'was', 'wasi', 'wasz', 'wasza', 'wasze', 'we', 'więc', 'wszystko', 'wtedy', 'wy',
        'żaden', 'zawsze', 'że',
    ];

    /**
     * Passes the Polish word list to the parent StopWords constructor.
     */
    public function __construct()
    {
        parent::__construct($this->stopWords);
    }
}
|
47
tests/Phpml/FeatureExtraction/StopWordsTest.php
Normal file
47
tests/Phpml/FeatureExtraction/StopWordsTest.php
Normal file
@ -0,0 +1,47 @@
|
||||
<?php
|
||||
|
||||
declare (strict_types = 1);
|
||||
|
||||
namespace tests\Phpml\FeatureExtraction;
|
||||
|
||||
use Phpml\FeatureExtraction\StopWords;
|
||||
|
||||
/**
 * Tests for custom stop word lists and the language factory.
 */
class StopWordsTest extends \PHPUnit_Framework_TestCase
{
    /**
     * A list built from custom words reports those words as stop words
     * and reports other words as not being stop words.
     */
    public function testCustomStopWords()
    {
        $stopWords = new StopWords(['lorem', 'ipsum', 'dolor']);

        foreach (['lorem', 'ipsum', 'dolor'] as $listed) {
            $this->assertTrue($stopWords->isStopWord($listed));
        }

        foreach (['consectetur', 'adipiscing', 'amet'] as $unlisted) {
            $this->assertFalse($stopWords->isStopWord($unlisted));
        }
    }

    /**
     * Asking the factory for a language without a list class must throw.
     *
     * @expectedException \Phpml\Exception\InvalidArgumentException
     */
    public function testThrowExceptionOnInvalidLanguage()
    {
        StopWords::factory('Lorem');
    }

    /**
     * The factory-built English list knows 'again' but not 'strategy'.
     */
    public function testEnglishStopWords()
    {
        $english = StopWords::factory('English');

        $knowsAgain = $english->isStopWord('again');
        $knowsStrategy = $english->isStopWord('strategy');

        $this->assertTrue($knowsAgain);
        $this->assertFalse($knowsStrategy);
    }

    /**
     * The factory-built Polish list knows 'wam' but not 'transhumanizm'.
     */
    public function testPolishStopWords()
    {
        $polish = StopWords::factory('Polish');

        $knowsWam = $polish->isStopWord('wam');
        $knowsTranshumanizm = $polish->isStopWord('transhumanizm');

        $this->assertTrue($knowsWam);
        $this->assertFalse($knowsTranshumanizm);
    }
}
|
Loading…
Reference in New Issue
Block a user