mirror of
https://github.com/Llewellynvdm/php-ml.git
synced 2025-01-23 15:18:24 +00:00
add word tokenizer
This commit is contained in:
parent
078f543146
commit
46197eba7b
21
src/Phpml/Tokenization/WordTokenizer.php
Normal file
21
src/Phpml/Tokenization/WordTokenizer.php
Normal file
@ -0,0 +1,21 @@
|
||||
<?php
|
||||
|
||||
declare (strict_types = 1);
|
||||
|
||||
namespace Phpml\Tokenization;
|
||||
|
||||
class WordTokenizer implements Tokenizer
{
    /**
     * Splits a text into word tokens.
     *
     * A token is a run of at least two consecutive word characters (`\w`),
     * matched with the `u` (UTF-8) pattern modifier. Single-character words
     * and everything between tokens (whitespace, punctuation, ...) are left out.
     *
     * @param string $text text to split into tokens
     *
     * @return array the matched tokens, in order of appearance
     */
    public function tokenize(string $text): array
    {
        // Two or more word characters in a row; the pattern has no capture groups.
        $pattern = '/\w\w+/u';

        $matches = [];
        preg_match_all($pattern, $text, $matches, PREG_PATTERN_ORDER);

        // Index 0 holds the list of full-pattern matches.
        return $matches[0];
    }
}
|
40
tests/Phpml/Tokenization/WordTokenizerTest.php
Normal file
40
tests/Phpml/Tokenization/WordTokenizerTest.php
Normal file
@ -0,0 +1,40 @@
|
||||
<?php
|
||||
|
||||
declare (strict_types = 1);
|
||||
|
||||
namespace tests\Tokenization;
|
||||
|
||||
use Phpml\Tokenization\WordTokenizer;
|
||||
|
||||
class WordTokenizerTest extends \PHPUnit_Framework_TestCase
{
    /**
     * ASCII words separated by spaces, hyphens, slashes, commas, semicolons,
     * periods and line breaks must come back as individual tokens.
     */
    public function testTokenizationOnAscii()
    {
        $input = 'Lorem ipsum-dolor sit amet, consectetur/adipiscing elit.
Cras consectetur, dui et lobortis;auctor.
Nulla vitae ,.,/ congue lorem.';

        $expected = [
            'Lorem', 'ipsum', 'dolor', 'sit', 'amet', 'consectetur', 'adipiscing', 'elit',
            'Cras', 'consectetur', 'dui', 'et', 'lobortis', 'auctor',
            'Nulla', 'vitae', 'congue', 'lorem',
        ];

        $actual = (new WordTokenizer())->tokenize($input);

        $this->assertEquals($expected, $actual);
    }

    /**
     * Multi-byte words must be tokenized as well; the single-character words
     * present in the input are not part of the expected result.
     */
    public function testTokenizationOnUtf8()
    {
        $input = '鋍鞎 鳼 鞮鞢騉 袟袘觕, 炟砏 蒮 謺貙蹖 偢偣唲 蒛 箷箯緷 鑴鱱爧 覮轀,
剆坲 煘煓瑐 鬐鶤鶐 飹勫嫢 銪 餀 枲柊氠 鍎鞚韕 焲犈,
殍涾烰 齞齝囃 蹅輶 鄜, 孻憵 擙樲橚 藒襓謥 岯岪弨 蒮 廞徲 孻憵懥 趡趛踠 槏';

        $expected = [
            '鋍鞎', '鞮鞢騉', '袟袘觕', '炟砏', '謺貙蹖', '偢偣唲', '箷箯緷', '鑴鱱爧', '覮轀',
            '剆坲', '煘煓瑐', '鬐鶤鶐', '飹勫嫢', '枲柊氠', '鍎鞚韕', '焲犈',
            '殍涾烰', '齞齝囃', '蹅輶', '孻憵', '擙樲橚', '藒襓謥', '岯岪弨', '廞徲', '孻憵懥', '趡趛踠',
        ];

        $actual = (new WordTokenizer())->tokenize($input);

        $this->assertEquals($expected, $actual);
    }
}
|
Loading…
x
Reference in New Issue
Block a user