diff --git a/docs/machine-learning/feature-extraction/token-count-vectorizer.md b/docs/machine-learning/feature-extraction/token-count-vectorizer.md index 8e2e9fd..4dc5260 100644 --- a/docs/machine-learning/feature-extraction/token-count-vectorizer.md +++ b/docs/machine-learning/feature-extraction/token-count-vectorizer.md @@ -71,3 +71,20 @@ $tokenizer->tokenize('Quick Fox'); // returns ['Q', 'u', 'i', 'c', 'k', 'Qu', 'ui', 'ic', 'ck', 'F', 'o', 'x', 'Fo', 'ox'] ``` + +**NGramWordTokenizer** + +The NGramWordTokenizer accepts the following parameters: + +`$minGram` - minimum number of words in a gram. Defaults to 1. +`$maxGram` - maximum number of words in a gram. Defaults to 2. + +```php +use Phpml\Tokenization\NGramWordTokenizer; + +$tokenizer = new NGramWordTokenizer(1, 2); + +$tokenizer->tokenize('very quick fox'); + +// returns ['very', 'quick', 'fox', 'very quick', 'quick fox'] +``` diff --git a/src/Tokenization/NGramWordTokenizer.php b/src/Tokenization/NGramWordTokenizer.php new file mode 100644 index 0000000..20ee28c --- /dev/null +++ b/src/Tokenization/NGramWordTokenizer.php @@ -0,0 +1,64 @@ + $maxGram) { + throw new InvalidArgumentException(sprintf('Invalid (%s, %s) minGram and maxGram value combination', $minGram, $maxGram)); + } + + $this->minGram = $minGram; + $this->maxGram = $maxGram; + } + + /** + * {@inheritdoc} + */ + public function tokenize(string $text): array + { + preg_match_all('/\w\w+/u', $text, $words); + + $words = $words[0]; + + $nGrams = []; + for ($j = $this->minGram; $j <= $this->maxGram; $j++) { + $nGrams = array_merge($nGrams, $this->getNgrams($words, $j)); + } + + return $nGrams; + } + + private function getNgrams(array $match, int $n = 2): array + { + $ngrams = []; + $len = count($match); + for ($i = 0; $i < $len; $i++) { + if ($i > ($n - 2)) { + $ng = ''; + for ($j = $n - 1; $j >= 0; $j--) { + $ng .= ' '.$match[$i - $j]; + } + $ngrams[] = trim($ng); + } + } + + return $ngrams; + } +} diff --git 
a/tests/Tokenization/NGramWordTokenizerTest.php b/tests/Tokenization/NGramWordTokenizerTest.php new file mode 100644 index 0000000..f9986d5 --- /dev/null +++ b/tests/Tokenization/NGramWordTokenizerTest.php @@ -0,0 +1,112 @@ +tokenize($text)); + } + + public function testMinGramGreaterThanMaxGramNotAllowed(): void + { + self::expectException(InvalidArgumentException::class); + + new NGramWordTokenizer(5, 2); + } + + public function testMinGramValueTooSmall(): void + { + self::expectException(InvalidArgumentException::class); + + new NGramWordTokenizer(0, 2); + } + + public function testMaxGramValueTooSmall(): void + { + self::expectException(InvalidArgumentException::class); + + new NGramWordTokenizer(1, 0); + } + + public function textDataProvider(): array + { + return [ + [ + 1, 1, + 'one two three four', + ['one', 'two', 'three', 'four'], + ], + [ + 1, 2, + 'one two three four', + ['one', 'two', 'three', 'four', 'one two', 'two three', 'three four'], + ], + [ + 1, 3, + 'one two three four', + ['one', 'two', 'three', 'four', 'one two', 'two three', 'three four', 'one two three', 'two three four'], + ], + [ + 2, 3, + 'one two three four', + ['one two', 'two three', 'three four', 'one two three', 'two three four'], + ], + [ + 1, 2, + '快狐跑过 边缘跑', + ['快狐跑过', '边缘跑', '快狐跑过 边缘跑'], + ], + [ + 2, 4, + $this->getSimpleText(), + [ + 'Lorem ipsum', 'ipsum dolor', 'dolor sit', 'sit amet', 'amet consectetur', 'consectetur adipiscing', + 'adipiscing elit', 'elit Cras', 'Cras consectetur', 'consectetur dui', 'dui et', 'et lobortis', + 'lobortis auctor', 'auctor Nulla', 'Nulla vitae', 'vitae congue', 'congue lorem', 'Lorem ipsum dolor', + 'ipsum dolor sit', 'dolor sit amet', 'sit amet consectetur', 'amet consectetur adipiscing', + 'consectetur adipiscing elit', 'adipiscing elit Cras', 'elit Cras consectetur', 'Cras consectetur dui', + 'consectetur dui et', 'dui et lobortis', 'et lobortis auctor', 'lobortis auctor Nulla', 'auctor Nulla vitae', + 'Nulla vitae congue', 'vitae congue 
lorem', 'Lorem ipsum dolor sit', 'ipsum dolor sit amet', + 'dolor sit amet consectetur', 'sit amet consectetur adipiscing', 'amet consectetur adipiscing elit', + 'consectetur adipiscing elit Cras', 'adipiscing elit Cras consectetur', 'elit Cras consectetur dui', + 'Cras consectetur dui et', 'consectetur dui et lobortis', 'dui et lobortis auctor', 'et lobortis auctor Nulla', + 'lobortis auctor Nulla vitae', 'auctor Nulla vitae congue', 'Nulla vitae congue lorem', + ], + ], + [ + 2, 4, + $this->getUtf8Text(), + [ + '鋍鞎 鞮鞢騉', '鞮鞢騉 袟袘觕', '袟袘觕 炟砏', '炟砏 謺貙蹖', '謺貙蹖 偢偣唲', '偢偣唲 箷箯緷', '箷箯緷 鑴鱱爧', '鑴鱱爧 覮轀', + '覮轀 剆坲', '剆坲 煘煓瑐', '煘煓瑐 鬐鶤鶐', '鬐鶤鶐 飹勫嫢', '飹勫嫢 枲柊氠', '枲柊氠 鍎鞚韕', '鍎鞚韕 焲犈', '焲犈 殍涾烰', + '殍涾烰 齞齝囃', '齞齝囃 蹅輶', '蹅輶 孻憵', '孻憵 擙樲橚', '擙樲橚 藒襓謥', '藒襓謥 岯岪弨', '岯岪弨 廞徲', '廞徲 孻憵懥', + '孻憵懥 趡趛踠', '鋍鞎 鞮鞢騉 袟袘觕', '鞮鞢騉 袟袘觕 炟砏', '袟袘觕 炟砏 謺貙蹖', '炟砏 謺貙蹖 偢偣唲', '謺貙蹖 偢偣唲 箷箯緷', + '偢偣唲 箷箯緷 鑴鱱爧', '箷箯緷 鑴鱱爧 覮轀', '鑴鱱爧 覮轀 剆坲', '覮轀 剆坲 煘煓瑐', '剆坲 煘煓瑐 鬐鶤鶐', '煘煓瑐 鬐鶤鶐 飹勫嫢', + '鬐鶤鶐 飹勫嫢 枲柊氠', '飹勫嫢 枲柊氠 鍎鞚韕', '枲柊氠 鍎鞚韕 焲犈', '鍎鞚韕 焲犈 殍涾烰', '焲犈 殍涾烰 齞齝囃', '殍涾烰 齞齝囃 蹅輶', + '齞齝囃 蹅輶 孻憵', '蹅輶 孻憵 擙樲橚', '孻憵 擙樲橚 藒襓謥', '擙樲橚 藒襓謥 岯岪弨', '藒襓謥 岯岪弨 廞徲', '岯岪弨 廞徲 孻憵懥', + '廞徲 孻憵懥 趡趛踠', '鋍鞎 鞮鞢騉 袟袘觕 炟砏', '鞮鞢騉 袟袘觕 炟砏 謺貙蹖', '袟袘觕 炟砏 謺貙蹖 偢偣唲', '炟砏 謺貙蹖 偢偣唲 箷箯緷', + '謺貙蹖 偢偣唲 箷箯緷 鑴鱱爧', '偢偣唲 箷箯緷 鑴鱱爧 覮轀', '箷箯緷 鑴鱱爧 覮轀 剆坲', '鑴鱱爧 覮轀 剆坲 煘煓瑐', + '覮轀 剆坲 煘煓瑐 鬐鶤鶐', '剆坲 煘煓瑐 鬐鶤鶐 飹勫嫢', '煘煓瑐 鬐鶤鶐 飹勫嫢 枲柊氠', '鬐鶤鶐 飹勫嫢 枲柊氠 鍎鞚韕', + '飹勫嫢 枲柊氠 鍎鞚韕 焲犈', '枲柊氠 鍎鞚韕 焲犈 殍涾烰', '鍎鞚韕 焲犈 殍涾烰 齞齝囃', '焲犈 殍涾烰 齞齝囃 蹅輶', + '殍涾烰 齞齝囃 蹅輶 孻憵', '齞齝囃 蹅輶 孻憵 擙樲橚', '蹅輶 孻憵 擙樲橚 藒襓謥', '孻憵 擙樲橚 藒襓謥 岯岪弨', '擙樲橚 藒襓謥 岯岪弨 廞徲', + '藒襓謥 岯岪弨 廞徲 孻憵懥', '岯岪弨 廞徲 孻憵懥 趡趛踠', + ], + ], + ]; + } +}