From 46197eba7b53c9eb48d16b3420ca0b091aaf0a31 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sat, 7 May 2016 23:17:52 +0200 Subject: [PATCH] add word tokenizer --- src/Phpml/Tokenization/WordTokenizer.php | 21 ++++++++++ .../Phpml/Tokenization/WordTokenizerTest.php | 40 +++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 src/Phpml/Tokenization/WordTokenizer.php create mode 100644 tests/Phpml/Tokenization/WordTokenizerTest.php diff --git a/src/Phpml/Tokenization/WordTokenizer.php b/src/Phpml/Tokenization/WordTokenizer.php new file mode 100644 index 0000000..c384c39 --- /dev/null +++ b/src/Phpml/Tokenization/WordTokenizer.php @@ -0,0 +1,21 @@ +assertEquals($tokens, $tokenizer->tokenize($text)); + } + + public function testTokenizationOnUtf8() + { + $tokenizer = new WordTokenizer(); + + $text = '鋍鞎 鳼 鞮鞢騉 袟袘觕, 炟砏 蒮 謺貙蹖 偢偣唲 蒛 箷箯緷 鑴鱱爧 覮轀, + 剆坲 煘煓瑐 鬐鶤鶐 飹勫嫢 銪 餀 枲柊氠 鍎鞚韕 焲犈, + 殍涾烰 齞齝囃 蹅輶 鄜, 孻憵 擙樲橚 藒襓謥 岯岪弨 蒮 廞徲 孻憵懥 趡趛踠 槏'; + + $tokens = ['鋍鞎', '鞮鞢騉', '袟袘觕', '炟砏', '謺貙蹖', '偢偣唲', '箷箯緷', '鑴鱱爧', '覮轀', + '剆坲', '煘煓瑐', '鬐鶤鶐', '飹勫嫢', '枲柊氠', '鍎鞚韕', '焲犈', + '殍涾烰', '齞齝囃', '蹅輶', '孻憵', '擙樲橚', '藒襓謥', '岯岪弨', '廞徲', '孻憵懥', '趡趛踠', ]; + + $this->assertEquals($tokens, $tokenizer->tokenize($text)); + } +}