mirror of
https://github.com/Llewellynvdm/php-ml.git
synced 2024-11-05 13:07:52 +00:00
a348111e97
* tests: update to PHPUnit 6.0 with rector * fix namespaces on tests * composer + tests: use standard test namespace naming * update travis * resolve conflict * phpstan lvl 2 * phpstan lvl 3 * phpstan lvl 4 * phpstan lvl 5 * phpstan lvl 6 * phpstan lvl 7 * level max * resolve conflict * [cs] clean empty docs * composer: bump to PHPUnit 6.4 * cleanup * composer + travis: add phpstan * phpstan lvl 1 * composer: update dev deps * phpstan fixes * update Contributing with new tools * docs: link fixes, PHP version update * composer: drop php-cs-fixer, cs already handled by ecs * ecs: add old set rules * [cs] apply rest of rules
42 lines
1.6 KiB
PHP
42 lines
1.6 KiB
PHP
<?php

declare(strict_types=1);

namespace Phpml\Tests\Tokenization;

use Phpml\Tokenization\WordTokenizer;
use PHPUnit\Framework\TestCase;

/**
 * Checks WordTokenizer::tokenize() against an ASCII fixture and a multi-byte
 * (UTF-8) fixture.
 *
 * In both cases the expected result is the list of words in input order, with
 * punctuation and whitespace removed.
 */
class WordTokenizerTest extends TestCase
{
    /**
     * ASCII text mixing several separators (space, hyphen, slash, comma,
     * semicolon, full stop, line break) must yield only the words.
     */
    public function testTokenizationOnAscii(): void
    {
        $tokenizer = new WordTokenizer();

        // The line breaks and indentation inside the literal are part of the
        // input; the expected tokens contain none of that whitespace.
        $text = 'Lorem ipsum-dolor sit amet, consectetur/adipiscing elit.
                 Cras consectetur, dui et lobortis;auctor.
                 Nulla vitae ,.,/ congue lorem.';

        // One row per input line, to keep fixture and expectation easy to compare.
        $expected = [
            'Lorem', 'ipsum', 'dolor', 'sit', 'amet', 'consectetur', 'adipiscing', 'elit',
            'Cras', 'consectetur', 'dui', 'et', 'lobortis', 'auctor',
            'Nulla', 'vitae', 'congue', 'lorem',
        ];

        $this->assertEquals($expected, $tokenizer->tokenize($text));
    }

    /**
     * Multi-byte text must be split on whitespace and commas as well.
     *
     * The fixture contains several single-character words (for example the
     * second word of the first line); none of them appear in the expected
     * list, so the test also pins that such words are dropped.
     */
    public function testTokenizationOnUtf8(): void
    {
        $tokenizer = new WordTokenizer();

        // The line breaks and indentation inside the literal are part of the
        // input; the expected tokens contain none of that whitespace.
        $text = '鋍鞎 鳼 鞮鞢騉 袟袘觕, 炟砏 蒮 謺貙蹖 偢偣唲 蒛 箷箯緷 鑴鱱爧 覮轀,
                 剆坲 煘煓瑐 鬐鶤鶐 飹勫嫢 銪 餀 枲柊氠 鍎鞚韕 焲犈,
                 殍涾烰 齞齝囃 蹅輶 鄜, 孻憵 擙樲橚 藒襓謥 岯岪弨 蒮 廞徲 孻憵懥 趡趛踠 槏';

        // One row per input line, to keep fixture and expectation easy to compare.
        $expected = [
            '鋍鞎', '鞮鞢騉', '袟袘觕', '炟砏', '謺貙蹖', '偢偣唲', '箷箯緷', '鑴鱱爧', '覮轀',
            '剆坲', '煘煓瑐', '鬐鶤鶐', '飹勫嫢', '枲柊氠', '鍎鞚韕', '焲犈',
            '殍涾烰', '齞齝囃', '蹅輶', '孻憵', '擙樲橚', '藒襓謥', '岯岪弨', '廞徲', '孻憵懥', '趡趛踠',
        ];

        $this->assertEquals($expected, $tokenizer->tokenize($text));
    }
}