php-ml/tests/Tokenization/WhitespaceTokenizerTest.php

42 lines
1.7 KiB
PHP
Raw Normal View History

2016-05-03 00:33:18 +02:00
<?php
2016-11-20 22:53:17 +01:00
declare(strict_types=1);
2016-05-03 00:33:18 +02:00
namespace Phpml\Tests\Tokenization;
2016-05-03 00:33:18 +02:00
use Phpml\Tokenization\WhitespaceTokenizer;
2017-02-03 12:58:25 +01:00
use PHPUnit\Framework\TestCase;
2016-05-03 00:33:18 +02:00
2017-02-03 12:58:25 +01:00
/**
 * Tests for WhitespaceTokenizer::tokenize().
 *
 * Each test feeds a multi-line text to the tokenizer and compares the result
 * with a hand-written list of expected tokens. In the expected lists,
 * punctuation stays attached to the neighbouring word (e.g. 'amet,' and
 * 'elit.'), and line breaks separate tokens just like spaces do.
 */
class WhitespaceTokenizerTest extends TestCase
{
    /**
     * Tokenizes a three-line ASCII text separated by spaces and newlines.
     */
    public function testTokenizationOnAscii(): void
    {
        $tokenizer = new WhitespaceTokenizer();

        // Lines are joined with "\n" explicitly so the fixture does not depend
        // on how this source file happens to be indented.
        $text = implode("\n", [
            'Lorem ipsum dolor sit amet, consectetur adipiscing elit.',
            'Cras consectetur, dui et lobortis auctor.',
            'Nulla vitae congue lorem.',
        ]);

        $tokens = [
            'Lorem', 'ipsum', 'dolor', 'sit', 'amet,', 'consectetur', 'adipiscing', 'elit.',
            'Cras', 'consectetur,', 'dui', 'et', 'lobortis', 'auctor.',
            'Nulla', 'vitae', 'congue', 'lorem.',
        ];

        self::assertEquals($tokens, $tokenizer->tokenize($text));
    }

    /**
     * Tokenizes a three-line text made of multi-byte (UTF-8) characters.
     *
     * The fixture deliberately mixes multi-character words with
     * single-character words ('鳼', '蒮', '槏', ...) so that one-character
     * tokens are covered as well.
     */
    public function testTokenizationOnUtf8(): void
    {
        $tokenizer = new WhitespaceTokenizer();

        // Every word listed in $tokens below must appear here, in the same
        // order; the single-character words are part of the input text.
        $text = implode("\n", [
            '鋍鞎 鳼 鞮鞢騉 袟袘觕, 炟砏 蒮 謺貙蹖 偢偣唲 蒛 箷箯緷 鑴鱱爧 覮轀,',
            '剆坲 煘煓瑐 鬐鶤鶐 飹勫嫢 銪 餀 枲柊氠 鍎鞚韕 焲犈,',
            '殍涾烰 齞齝囃 蹅輶 鄜, 孻憵 擙樲橚 藒襓謥 岯岪弨 蒮 廞徲 孻憵懥 趡趛踠 槏',
        ]);

        $tokens = [
            '鋍鞎', '鳼', '鞮鞢騉', '袟袘觕,', '炟砏', '蒮', '謺貙蹖', '偢偣唲', '蒛', '箷箯緷', '鑴鱱爧', '覮轀,',
            '剆坲', '煘煓瑐', '鬐鶤鶐', '飹勫嫢', '銪', '餀', '枲柊氠', '鍎鞚韕', '焲犈,',
            '殍涾烰', '齞齝囃', '蹅輶', '鄜,', '孻憵', '擙樲橚', '藒襓謥', '岯岪弨', '蒮', '廞徲', '孻憵懥', '趡趛踠', '槏',
        ];

        self::assertEquals($tokens, $tokenizer->tokenize($text));
    }
}