php-ml/src/Tokenization/WhitespaceTokenizer.php

21 lines
462 B
PHP
Raw Normal View History

2016-05-02 22:33:18 +00:00
<?php
2016-11-20 21:53:17 +00:00
declare(strict_types=1);
2016-05-02 22:33:18 +00:00
namespace Phpml\Tokenization;
2018-10-28 06:44:52 +00:00
use Phpml\Exception\InvalidArgumentException;
2016-05-02 22:33:18 +00:00
class WhitespaceTokenizer implements Tokenizer
{
    /**
     * Breaks the given text into tokens, using every run of one or more
     * characters from the Unicode "separator" (\pZ) or "other" (\pC)
     * categories as a delimiter. Empty pieces are discarded.
     *
     * @return string[] the non-empty pieces, in their original order
     *
     * @throws InvalidArgumentException when preg_split() returns false
     */
    public function tokenize(string $text): array
    {
        // The /u modifier makes PCRE treat both pattern and subject as UTF-8.
        $tokens = preg_split('/[\pZ\pC]+/u', $text, -1, PREG_SPLIT_NO_EMPTY);

        // Happy path first: anything other than false is the token list.
        if ($tokens !== false) {
            return $tokens;
        }

        throw new InvalidArgumentException('preg_split failed on: '.$text);
    }
}