2016-05-03 00:33:18 +02:00
|
|
|
<?php
|
|
|
|
|
2016-11-20 22:53:17 +01:00
|
|
|
declare(strict_types=1);
|
2016-05-03 00:33:18 +02:00
|
|
|
|
2018-01-06 13:09:33 +01:00
|
|
|
namespace Phpml\Tests\Tokenization;
|
2016-05-03 00:33:18 +02:00
|
|
|
|
|
|
|
use Phpml\Tokenization\WhitespaceTokenizer;
|
|
|
|
|
2019-02-15 17:31:10 +01:00
|
|
|
/**
 * Checks the tokens produced by WhitespaceTokenizer::tokenize() against
 * hand-written expectations for two fixture texts.
 *
 * The fixture texts come from getSimpleText() and getUtf8Text(), which are not
 * defined in this class (presumably inherited from TokenizerTest - confirm there).
 */
class WhitespaceTokenizerTest extends TokenizerTest
{
    /**
     * Tokenizes the text returned by getSimpleText() and compares the result
     * with the expected list.
     *
     * Note that the expected tokens keep punctuation attached
     * (e.g. 'amet,', 'ipsum-dolor', 'lobortis;auctor.').
     */
    public function testTokenizationOnAscii(): void
    {
        $expected = [
            'Lorem', 'ipsum-dolor', 'sit', 'amet,', 'consectetur/adipiscing', 'elit.',
            'Cras', 'consectetur,', 'dui', 'et', 'lobortis;auctor.',
            'Nulla', 'vitae', ',.,/', 'congue', 'lorem.',
        ];

        $whitespaceTokenizer = new WhitespaceTokenizer();
        $actual = $whitespaceTokenizer->tokenize($this->getSimpleText());

        self::assertEquals($expected, $actual);
    }

    /**
     * Tokenizes the text returned by getUtf8Text() and compares the result
     * with the expected list of multi-byte tokens.
     *
     * As in the ASCII case, trailing commas remain part of the expected tokens.
     */
    public function testTokenizationOnUtf8(): void
    {
        $expected = [
            '鋍鞎', '鳼', '鞮鞢騉', '袟袘觕,', '炟砏', '蒮', '謺貙蹖', '偢偣唲', '蒛', '箷箯緷', '鑴鱱爧', '覮轀,',
            '剆坲', '煘煓瑐', '鬐鶤鶐', '飹勫嫢', '銪', '餀', '枲柊氠', '鍎鞚韕', '焲犈,',
            '殍涾烰', '齞齝囃', '蹅輶', '鄜,', '孻憵', '擙樲橚', '藒襓謥', '岯岪弨', '蒮', '廞徲', '孻憵懥', '趡趛踠', '槏',
        ];

        $whitespaceTokenizer = new WhitespaceTokenizer();
        $actual = $whitespaceTokenizer->tokenize($this->getUtf8Text());

        self::assertEquals($expected, $actual);
    }
}
|