From fadd003169f490c9297174c9814c69e175e0ba6e Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Tue, 3 May 2016 00:33:18 +0200 Subject: [PATCH] create whitespace tokenizer --- src/Phpml/Tokenization/Tokenizer.php | 15 +++++++ .../Tokenization/WhitespaceTokenizer.php | 18 +++++++++ .../Tokenization/WhitespaceTokenizerTest.php | 40 +++++++++++++++++++ 3 files changed, 73 insertions(+) create mode 100644 src/Phpml/Tokenization/Tokenizer.php create mode 100644 src/Phpml/Tokenization/WhitespaceTokenizer.php create mode 100644 tests/Phpml/Tokenization/WhitespaceTokenizerTest.php diff --git a/src/Phpml/Tokenization/Tokenizer.php b/src/Phpml/Tokenization/Tokenizer.php new file mode 100644 index 0000000..5539a85 --- /dev/null +++ b/src/Phpml/Tokenization/Tokenizer.php @@ -0,0 +1,15 @@ +assertEquals($tokens, $tokenizer->tokenize($text)); + } + + /** Checks that tokenize() on a fresh WhitespaceTokenizer splits multi-byte (UTF-8) text into the expected tokens; punctuation such as a trailing comma stays attached to the preceding word. */ public function testTokenizationOnUtf8() + { + $tokenizer = new WhitespaceTokenizer(); + + $text = '鋍鞎 鳼 鞮鞢騉 袟袘觕, 炟砏 蒮 謺貙蹖 偢偣唲 蒛 箷箯緷 鑴鱱爧 覮轀, + 剆坲 煘煓瑐 鬐鶤鶐 飹勫嫢 銪 餀 枲柊氠 鍎鞚韕 焲犈, + 殍涾烰 齞齝囃 蹅輶 鄜, 孻憵 擙樲橚 藒襓謥 岯岪弨 蒮 廞徲 孻憵懥 趡趛踠 槏'; + + /* Expected result: one entry per whitespace-separated run, in input order. NOTE(review): the text literal above continues across diff lines, so line breaks and leading indentation presumably also have to act as separators - confirm against the tokenizer implementation. */ $tokens = ['鋍鞎', '鳼', '鞮鞢騉', '袟袘觕,', '炟砏', '蒮', '謺貙蹖', '偢偣唲', '蒛', '箷箯緷', '鑴鱱爧', '覮轀,', + '剆坲', '煘煓瑐', '鬐鶤鶐', '飹勫嫢', '銪', '餀', '枲柊氠', '鍎鞚韕', '焲犈,', + '殍涾烰', '齞齝囃', '蹅輶', '鄜,', '孻憵', '擙樲橚', '藒襓謥', '岯岪弨', '蒮', '廞徲', '孻憵懥', '趡趛踠', '槏', ]; + + $this->assertEquals($tokens, $tokenizer->tokenize($text)); + } +}