2016-05-02 22:33:18 +00:00
|
|
|
<?php
|
|
|
|
|
2016-11-20 21:53:17 +00:00
|
|
|
declare(strict_types=1);
|
2016-05-02 22:33:18 +00:00
|
|
|
|
|
|
|
namespace Phpml\Tokenization;
|
|
|
|
|
2018-10-28 06:44:52 +00:00
|
|
|
use Phpml\Exception\InvalidArgumentException;
|
|
|
|
|
2016-05-02 22:33:18 +00:00
|
|
|
/**
 * Tokenizer that cuts text into tokens at whitespace-like boundaries.
 *
 * A boundary is any run of one or more characters matching the Unicode
 * property classes \pZ (separators) or \pC ("other": control, format, ...).
 */
class WhitespaceTokenizer implements Tokenizer
{
    /**
     * Splits the given text into its non-empty tokens.
     *
     * @param string $text Text to tokenize; matched in UTF-8 mode (`u` modifier).
     *
     * @return string[] Tokens in order of appearance; empty pieces are dropped.
     *
     * @throws InvalidArgumentException When preg_split() reports failure by returning false.
     */
    public function tokenize(string $text): array
    {
        // PREG_SPLIT_NO_EMPTY discards the empty strings that leading or
        // trailing delimiters would otherwise produce.
        $tokens = preg_split('/[\pZ\pC]+/u', $text, -1, PREG_SPLIT_NO_EMPTY);

        if ($tokens !== false) {
            return $tokens;
        }

        // preg_split() signals failure with false rather than throwing, so
        // convert that into an exception callers can catch.
        throw new InvalidArgumentException('preg_split failed on: '.$text);
    }
}
|