Provide a new NGramTokenizer with minGram and maxGram support (#350)

* Issue #349: Provide a new NGramTokenizer.

* Issue #349: Add tests.

* Fixes from code review.

* Implement NGramTokenizer with min and max gram support

* Add missing tests for ngram

* Add info about NGramTokenizer to docs and readme

* Add performance test for tokenization
This commit is contained in:
Pol Dellaiera 2019-02-15 17:31:10 +01:00 committed by Arkadiusz Kondas
parent b3fe9dae1e
commit 02dab41830
8 changed files with 246 additions and 27 deletions

View File

@ -102,6 +102,9 @@ Public datasets are available in a separate repository [php-ai/php-ml-datasets](
* [Imputation missing values](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/imputation-missing-values/) * [Imputation missing values](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/imputation-missing-values/)
* Feature Extraction * Feature Extraction
* [Token Count Vectorizer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/token-count-vectorizer/) * [Token Count Vectorizer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/token-count-vectorizer/)
* NGramTokenizer
* WhitespaceTokenizer
* WordTokenizer
* [Tf-idf Transformer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/tf-idf-transformer/) * [Tf-idf Transformer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/tf-idf-transformer/)
* Dimensionality Reduction * Dimensionality Reduction
* PCA (Principal Component Analysis) * PCA (Principal Component Analysis)

View File

@ -53,3 +53,21 @@ $vectorizer->getVocabulary();
* WhitespaceTokenizer - select tokens by whitespace. * WhitespaceTokenizer - select tokens by whitespace.
* WordTokenizer - select tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator). * WordTokenizer - select tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator).
* NGramTokenizer - continuous sequence of characters of the specified length. They are useful for querying languages that don't use spaces or that have long compound words, like German.
**NGramTokenizer**
The NGramTokenizer accepts the following parameters:
`$minGram` - minimum number of characters per gram. Defaults to 1.
`$maxGram` - maximum number of characters per gram. Defaults to 2.
```php
use Phpml\Tokenization\NGramTokenizer;
$tokenizer = new NGramTokenizer(1, 2);
$tokenizer->tokenize('Quick Fox');
// returns ['Q', 'u', 'i', 'c', 'k', 'Qu', 'ui', 'ic', 'ck', 'F', 'o', 'x', 'Fo', 'ox']
```

View File

@ -0,0 +1,59 @@
<?php
declare(strict_types=1);
namespace Phpml\Tokenization;
use Phpml\Exception\InvalidArgumentException;
class NGramTokenizer extends WordTokenizer
{
    /**
     * Minimum gram length, inclusive. Always >= 1.
     *
     * @var int
     */
    private $minGram;

    /**
     * Maximum gram length, inclusive. Always >= $minGram.
     *
     * @var int
     */
    private $maxGram;

    /**
     * @param int $minGram minimum number of characters per gram (default 1)
     * @param int $maxGram maximum number of characters per gram (default 2)
     *
     * @throws InvalidArgumentException when either bound is below 1 or $minGram exceeds $maxGram
     */
    public function __construct(int $minGram = 1, int $maxGram = 2)
    {
        if ($minGram < 1 || $maxGram < 1 || $minGram > $maxGram) {
            throw new InvalidArgumentException(sprintf('Invalid (%s, %s) minGram and maxGram value combination', $minGram, $maxGram));
        }

        $this->minGram = $minGram;
        $this->maxGram = $maxGram;
    }

    /**
     * {@inheritdoc}
     *
     * Splits $text into words of two or more word-characters (the `u` flag makes
     * `\w` multibyte-aware) and, word by word, emits every contiguous substring
     * whose length lies in [minGram, maxGram]. All grams of one word are emitted
     * before moving to the next word, shortest lengths first.
     */
    public function tokenize(string $text): array
    {
        $words = [];
        preg_match_all('/\w\w+/u', $text, $words);

        $nGrams = [];
        foreach ($words[0] as $word) {
            $this->generateNGrams($word, $nGrams);
        }

        return $nGrams;
    }

    /**
     * Appends every n-gram of $word with length in [minGram, maxGram] to $nGrams.
     * Uses mb_strlen/mb_substr so each multibyte (e.g. CJK) character counts as one unit.
     */
    private function generateNGrams(string $word, array &$nGrams): void
    {
        $length = mb_strlen($word);

        // Start at $minGram rather than 1: the original skipped shorter lengths
        // with an inner check, so those iterations were pure wasted work.
        // Output order is unchanged.
        for ($j = $this->minGram; $j <= $this->maxGram; $j++) {
            for ($k = 0; $k < $length - $j + 1; $k++) {
                $nGrams[] = mb_substr($word, $k, $j);
            }
        }
    }
}

View File

@ -0,0 +1,33 @@
<?php
declare(strict_types=1);
namespace Phpml\Tests\Performance\Tokenization;
use PhpBench\Benchmark\Metadata\Annotations\Iterations;
use PhpBench\Benchmark\Metadata\Annotations\Revs;
use Phpml\Tokenization\NGramTokenizer;
final class NGramTokenizerBench
{
    /**
     * Benchmarks NGramTokenizer::tokenize() over a fixed multi-sentence
     * Lorem Ipsum paragraph, producing 2- and 3-grams.
     *
     * @Revs(1000)
     * @Iterations(5)
     */
    public function benchSimpleTokenizer(): void
    {
        $text = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent placerat blandit cursus. Suspendisse sed
turpis sit amet enim viverra sodales a euismod est. Ut vitae tincidunt est. Proin venenatis placerat nunc
sed ornare. Etiam feugiat, nisl nec sollicitudin sodales, nulla massa sollicitudin ipsum, vitae cursus ante
velit vitae arcu. Vestibulum feugiat ultricies hendrerit. Morbi sed varius metus. Nam feugiat maximus
turpis, a sollicitudin ligula porttitor eu.Fusce hendrerit tellus et dignissim sagittis. Nulla consectetur
condimentum tortor, non bibendum erat lacinia eget. Integer vitae maximus tortor. Vestibulum ante ipsum
primis in faucibus orci luctus et ultrices posuere cubilia Curae; Pellentesque suscipit sem ipsum, in
tincidunt risus pellentesque vel. Nullam hendrerit consequat leo, in suscipit lectus euismod non. Cras arcu
lacus, lacinia semper mauris vel, pharetra dignissim velit. Nam lacinia turpis a nibh bibendum, et
placerat tellus accumsan. Sed tincidunt cursus nisi in laoreet. Suspendisse amet.';

        // Result is deliberately discarded — only tokenization cost is measured.
        (new NGramTokenizer(2, 3))->tokenize($text);
    }
}

View File

@ -0,0 +1,100 @@
<?php
declare(strict_types=1);
namespace Phpml\Tests\Tokenization;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Tokenization\NGramTokenizer;
/**
* Inspiration: https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-ngram-tokenizer.html
*/
class NGramTokenizerTest extends TokenizerTest
{
    /**
     * Verifies that tokenize() yields exactly the expected grams, in order,
     * for each (minGram, maxGram, text) combination from the provider.
     *
     * @dataProvider textDataProvider
     */
    public function testNGramTokenization(int $minGram, int $maxGram, string $text, array $tokens): void
    {
        $tokenizer = new NGramTokenizer($minGram, $maxGram);

        self::assertEquals($tokens, $tokenizer->tokenize($text));
    }

    // The constructor must reject a lower bound greater than the upper bound.
    public function testMinGramGreaterThanMaxGramNotAllowed(): void
    {
        self::expectException(InvalidArgumentException::class);

        new NGramTokenizer(5, 2);
    }

    // Gram lengths below 1 are invalid for the lower bound.
    public function testMinGramValueTooSmall(): void
    {
        self::expectException(InvalidArgumentException::class);

        new NGramTokenizer(0, 2);
    }

    // Gram lengths below 1 are invalid for the upper bound, too.
    public function testMaxGramValueTooSmall(): void
    {
        self::expectException(InvalidArgumentException::class);

        new NGramTokenizer(1, 0);
    }

    /**
     * Each dataset: [minGram, maxGram, input text, expected grams].
     * Expected grams are grouped per word, shortest length first within a word.
     */
    public function textDataProvider(): array
    {
        return [
            // Unigrams and bigrams of two ASCII words.
            [
                1, 2,
                'Quick Fox',
                ['Q', 'u', 'i', 'c', 'k', 'Qu', 'ui', 'ic', 'ck', 'F', 'o', 'x', 'Fo', 'ox'],
            ],
            // Trigrams only (minGram == maxGram).
            [
                3, 3,
                'Quick Foxes',
                ['Qui', 'uic', 'ick', 'Fox', 'oxe', 'xes'],
            ],
            // Multibyte (CJK) input: each character must count as one unit.
            [
                1, 2,
                '快狐跑过 边缘跑',
                ['快', '狐', '跑', '过', '快狐', '狐跑', '跑过', '边', '缘', '跑', '边缘', '缘跑'],
            ],
            // Multibyte trigrams only.
            [
                3, 3,
                '快狐跑过狐 边缘跑狐狐',
                ['快狐跑', '狐跑过', '跑过狐', '边缘跑', '缘跑狐', '跑狐狐'],
            ],
            // 2- to 4-grams over the shared ASCII fixture from TokenizerTest.
            [
                2, 4,
                $this->getSimpleText(),
                [
                    'Lo', 'or', 're', 'em', 'Lor', 'ore', 'rem', 'Lore', 'orem', 'ip', 'ps', 'su', 'um', 'ips', 'psu', 'sum', 'ipsu',
                    'psum', 'do', 'ol', 'lo', 'or', 'dol', 'olo', 'lor', 'dolo', 'olor', 'si', 'it', 'sit', 'am', 'me', 'et', 'ame',
                    'met', 'amet', 'co', 'on', 'ns', 'se', 'ec', 'ct', 'te', 'et', 'tu', 'ur', 'con', 'ons', 'nse', 'sec', 'ect', 'cte',
                    'tet', 'etu', 'tur', 'cons', 'onse', 'nsec', 'sect', 'ecte', 'ctet', 'tetu', 'etur', 'ad', 'di', 'ip', 'pi', 'is',
                    'sc', 'ci', 'in', 'ng', 'adi', 'dip', 'ipi', 'pis', 'isc', 'sci', 'cin', 'ing', 'adip', 'dipi', 'ipis', 'pisc',
                    'isci', 'scin', 'cing', 'el', 'li', 'it', 'eli', 'lit', 'elit', 'Cr', 'ra', 'as', 'Cra', 'ras', 'Cras', 'co', 'on',
                    'ns', 'se', 'ec', 'ct', 'te', 'et', 'tu', 'ur', 'con', 'ons', 'nse', 'sec', 'ect', 'cte', 'tet', 'etu', 'tur',
                    'cons', 'onse', 'nsec', 'sect', 'ecte', 'ctet', 'tetu', 'etur', 'du', 'ui', 'dui', 'et', 'lo', 'ob', 'bo', 'or',
                    'rt', 'ti', 'is', 'lob', 'obo', 'bor', 'ort', 'rti', 'tis', 'lobo', 'obor', 'bort', 'orti', 'rtis', 'au', 'uc',
                    'ct', 'to', 'or', 'auc', 'uct', 'cto', 'tor', 'auct', 'ucto', 'ctor', 'Nu', 'ul', 'll', 'la', 'Nul', 'ull', 'lla',
                    'Null', 'ulla', 'vi', 'it', 'ta', 'ae', 'vit', 'ita', 'tae', 'vita', 'itae', 'co', 'on', 'ng', 'gu', 'ue', 'con',
                    'ong', 'ngu', 'gue', 'cong', 'ongu', 'ngue', 'lo', 'or', 're', 'em', 'lor', 'ore', 'rem', 'lore', 'orem',
                ],
            ],
            // 2- to 4-grams over the shared UTF-8 fixture from TokenizerTest.
            // NOTE(review): fixture contains repeated words (e.g. 孻憵), so
            // duplicate grams in the expectation are intentional.
            [
                2, 4,
                $this->getUtf8Text(),
                [
                    '鋍鞎', '鞮鞢', '鞢騉', '鞮鞢騉', '袟袘', '袘觕', '袟袘觕', '炟砏', '謺貙', '貙蹖', '謺貙蹖', '偢偣', '偣唲',
                    '偢偣唲', '箷箯', '箯緷', '箷箯緷', '鑴鱱', '鱱爧', '鑴鱱爧', '覮轀', '剆坲', '煘煓', '煓瑐', '煘煓瑐', '鬐鶤',
                    '鶤鶐', '鬐鶤鶐', '飹勫', '勫嫢', '飹勫嫢', '枲柊', '柊氠', '枲柊氠', '鍎鞚', '鞚韕', '鍎鞚韕', '焲犈', '殍涾',
                    '涾烰', '殍涾烰', '齞齝', '齝囃', '齞齝囃', '蹅輶', '孻憵', '擙樲', '樲橚', '擙樲橚', '藒襓', '襓謥', '藒襓謥',
                    '岯岪', '岪弨', '岯岪弨', '廞徲', '孻憵', '憵懥', '孻憵懥', '趡趛', '趛踠', '趡趛踠',
                ],
            ],
        ];
    }
}

View File

@ -0,0 +1,24 @@
<?php
declare(strict_types=1);
namespace Phpml\Tests\Tokenization;
use PHPUnit\Framework\TestCase;
abstract class TokenizerTest extends TestCase
{
    /**
     * Shared ASCII fixture: words separated by spaces, hyphens, slashes and
     * semicolons plus a stray punctuation run, to exercise separator handling.
     *
     * NOTE(review): the line breaks (and any leading whitespace) inside the
     * literal are part of the returned string — do not re-indent or reflow.
     */
    public function getSimpleText(): string
    {
        return 'Lorem ipsum-dolor sit amet, consectetur/adipiscing elit.
Cras consectetur, dui et lobortis;auctor.
Nulla vitae ,.,/ congue lorem.';
    }

    /**
     * Shared multibyte (CJK) fixture used by the tokenizer tests; includes
     * repeated words and isolated punctuation. Same caveat as above: the
     * literal's internal whitespace is significant.
     */
    public function getUtf8Text(): string
    {
        return '鋍鞎 鞮鞢騉 袟袘觕, 炟砏 謺貙蹖 偢偣唲 箷箯緷 鑴鱱爧 覮轀,
剆坲 煘煓瑐 鬐鶤鶐 飹勫嫢 枲柊氠 鍎鞚韕 焲犈,
殍涾烰 齞齝囃 蹅輶 , 孻憵 擙樲橚 藒襓謥 岯岪弨 廞徲 孻憵懥 趡趛踠 ';
    }
}

View File

@ -5,37 +5,28 @@ declare(strict_types=1);
namespace Phpml\Tests\Tokenization; namespace Phpml\Tests\Tokenization;
use Phpml\Tokenization\WhitespaceTokenizer; use Phpml\Tokenization\WhitespaceTokenizer;
use PHPUnit\Framework\TestCase;
class WhitespaceTokenizerTest extends TestCase class WhitespaceTokenizerTest extends TokenizerTest
{ {
public function testTokenizationOnAscii(): void public function testTokenizationOnAscii(): void
{ {
$tokenizer = new WhitespaceTokenizer(); $tokenizer = new WhitespaceTokenizer();
$text = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. $tokens = ['Lorem', 'ipsum-dolor', 'sit', 'amet,', 'consectetur/adipiscing', 'elit.',
Cras consectetur, dui et lobortis auctor. 'Cras', 'consectetur,', 'dui', 'et', 'lobortis;auctor.',
Nulla vitae congue lorem.'; 'Nulla', 'vitae', ',.,/', 'congue', 'lorem.', ];
$tokens = ['Lorem', 'ipsum', 'dolor', 'sit', 'amet,', 'consectetur', 'adipiscing', 'elit.', self::assertEquals($tokens, $tokenizer->tokenize($this->getSimpleText()));
'Cras', 'consectetur,', 'dui', 'et', 'lobortis', 'auctor.',
'Nulla', 'vitae', 'congue', 'lorem.', ];
self::assertEquals($tokens, $tokenizer->tokenize($text));
} }
public function testTokenizationOnUtf8(): void public function testTokenizationOnUtf8(): void
{ {
$tokenizer = new WhitespaceTokenizer(); $tokenizer = new WhitespaceTokenizer();
$text = '鋍鞎 鞮鞢騉 袟袘觕, 炟砏 謺貙蹖 偢偣唲 箷箯緷 鑴鱱爧 覮轀,
剆坲 煘煓瑐 鬐鶤鶐 飹勫嫢 枲柊氠 鍎鞚韕 焲犈,
殍涾烰 齞齝囃 蹅輶 , 孻憵 擙樲橚 藒襓謥 岯岪弨 廞徲 孻憵懥 趡趛踠 ';
$tokens = ['鋍鞎', '鳼', '鞮鞢騉', '袟袘觕,', '炟砏', '蒮', '謺貙蹖', '偢偣唲', '蒛', '箷箯緷', '鑴鱱爧', '覮轀,', $tokens = ['鋍鞎', '鳼', '鞮鞢騉', '袟袘觕,', '炟砏', '蒮', '謺貙蹖', '偢偣唲', '蒛', '箷箯緷', '鑴鱱爧', '覮轀,',
'剆坲', '煘煓瑐', '鬐鶤鶐', '飹勫嫢', '銪', '餀', '枲柊氠', '鍎鞚韕', '焲犈,', '剆坲', '煘煓瑐', '鬐鶤鶐', '飹勫嫢', '銪', '餀', '枲柊氠', '鍎鞚韕', '焲犈,',
'殍涾烰', '齞齝囃', '蹅輶', '鄜,', '孻憵', '擙樲橚', '藒襓謥', '岯岪弨', '蒮', '廞徲', '孻憵懥', '趡趛踠', '槏', ]; '殍涾烰', '齞齝囃', '蹅輶', '鄜,', '孻憵', '擙樲橚', '藒襓謥', '岯岪弨', '蒮', '廞徲', '孻憵懥', '趡趛踠', '槏', ];
self::assertEquals($tokens, $tokenizer->tokenize($text)); self::assertEquals($tokens, $tokenizer->tokenize($this->getUtf8Text()));
} }
} }

View File

@ -5,37 +5,28 @@ declare(strict_types=1);
namespace Phpml\Tests\Tokenization; namespace Phpml\Tests\Tokenization;
use Phpml\Tokenization\WordTokenizer; use Phpml\Tokenization\WordTokenizer;
use PHPUnit\Framework\TestCase;
class WordTokenizerTest extends TestCase class WordTokenizerTest extends TokenizerTest
{ {
public function testTokenizationOnAscii(): void public function testTokenizationOnAscii(): void
{ {
$tokenizer = new WordTokenizer(); $tokenizer = new WordTokenizer();
$text = 'Lorem ipsum-dolor sit amet, consectetur/adipiscing elit.
Cras consectetur, dui et lobortis;auctor.
Nulla vitae ,.,/ congue lorem.';
$tokens = ['Lorem', 'ipsum', 'dolor', 'sit', 'amet', 'consectetur', 'adipiscing', 'elit', $tokens = ['Lorem', 'ipsum', 'dolor', 'sit', 'amet', 'consectetur', 'adipiscing', 'elit',
'Cras', 'consectetur', 'dui', 'et', 'lobortis', 'auctor', 'Cras', 'consectetur', 'dui', 'et', 'lobortis', 'auctor',
'Nulla', 'vitae', 'congue', 'lorem', ]; 'Nulla', 'vitae', 'congue', 'lorem', ];
self::assertEquals($tokens, $tokenizer->tokenize($text)); self::assertEquals($tokens, $tokenizer->tokenize($this->getSimpleText()));
} }
public function testTokenizationOnUtf8(): void public function testTokenizationOnUtf8(): void
{ {
$tokenizer = new WordTokenizer(); $tokenizer = new WordTokenizer();
$text = '鋍鞎 鞮鞢騉 袟袘觕, 炟砏 謺貙蹖 偢偣唲 箷箯緷 鑴鱱爧 覮轀,
剆坲 煘煓瑐 鬐鶤鶐 飹勫嫢 枲柊氠 鍎鞚韕 焲犈,
殍涾烰 齞齝囃 蹅輶 , 孻憵 擙樲橚 藒襓謥 岯岪弨 廞徲 孻憵懥 趡趛踠 ';
$tokens = ['鋍鞎', '鞮鞢騉', '袟袘觕', '炟砏', '謺貙蹖', '偢偣唲', '箷箯緷', '鑴鱱爧', '覮轀', $tokens = ['鋍鞎', '鞮鞢騉', '袟袘觕', '炟砏', '謺貙蹖', '偢偣唲', '箷箯緷', '鑴鱱爧', '覮轀',
'剆坲', '煘煓瑐', '鬐鶤鶐', '飹勫嫢', '枲柊氠', '鍎鞚韕', '焲犈', '剆坲', '煘煓瑐', '鬐鶤鶐', '飹勫嫢', '枲柊氠', '鍎鞚韕', '焲犈',
'殍涾烰', '齞齝囃', '蹅輶', '孻憵', '擙樲橚', '藒襓謥', '岯岪弨', '廞徲', '孻憵懥', '趡趛踠', ]; '殍涾烰', '齞齝囃', '蹅輶', '孻憵', '擙樲橚', '藒襓謥', '岯岪弨', '廞徲', '孻憵懥', '趡趛踠', ];
self::assertEquals($tokens, $tokenizer->tokenize($text)); self::assertEquals($tokens, $tokenizer->tokenize($this->getUtf8Text()));
} }
} }