<?php

declare(strict_types=1);

namespace Phpml\Tests\FeatureExtraction;

use Phpml\FeatureExtraction\StopWords;
use Phpml\FeatureExtraction\TokenCountVectorizer;
use Phpml\Tokenization\WhitespaceTokenizer;
use PHPUnit\Framework\TestCase;
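
/**
 * TokenCountVectorizer tests: fit() builds the vocabulary from the samples and
 * transform() replaces each sample in place with its token count vector,
 * optionally honouring a minimum document frequency and a stop word list.
 */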
class TokenCountVectorizerTest extends TestCase
{
    public function testTransformationWithWhitespaceTokenizer(): void
    {
        $samples = [
            'Lorem ipsum dolor sit amet dolor',
            'Mauris placerat ipsum dolor',
            'Mauris diam eros fringilla diam',
        ];

        $vocabulary = [
            0 => 'Lorem',
            1 => 'ipsum',
            2 => 'dolor',
            3 => 'sit',
            4 => 'amet',
            5 => 'Mauris',
            6 => 'placerat',
            7 => 'diam',
            8 => 'eros',
            9 => 'fringilla',
        ];

        $tokensCounts = [
            [0 => 1, 1 => 1, 2 => 2, 3 => 1, 4 => 1, 5 => 0, 6 => 0, 7 => 0, 8 => 0, 9 => 0],
            [0 => 0, 1 => 1, 2 => 1, 3 => 0, 4 => 0, 5 => 1, 6 => 1, 7 => 0, 8 => 0, 9 => 0],
            [0 => 0, 1 => 0, 2 => 0, 3 => 0, 4 => 0, 5 => 1, 6 => 0, 7 => 2, 8 => 1, 9 => 1],
        ];

        $vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer());

        $vectorizer->fit($samples);
        self::assertSame($vocabulary, $vectorizer->getVocabulary());

        $vectorizer->transform($samples);
        self::assertSame($tokensCounts, $samples);
    }
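
    // The third constructor argument below (0.5, then 1) sets the minimum
    // share of samples a token must appear in; per the expectations in this
    // test, rarer tokens keep their vocabulary index but are zeroed out in
    // the transformed count vectors.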
    public function testTransformationWithMinimumDocumentTokenCountFrequency(): void
    {
        // word appears in at least half of the samples
        $samples = [
            'Lorem ipsum dolor sit amet 1550',
            'Lorem ipsum sit amet',
            'ipsum sit amet',
            'ipsum sit amet',
        ];

        $vocabulary = [
            0 => 'Lorem',
            1 => 'ipsum',
            2 => 'dolor',
            3 => 'sit',
            4 => 'amet',
            5 => 1550,
        ];

        $tokensCounts = [
            [0 => 1, 1 => 1, 2 => 0, 3 => 1, 4 => 1, 5 => 0],
            [0 => 1, 1 => 1, 2 => 0, 3 => 1, 4 => 1, 5 => 0],
            [0 => 0, 1 => 1, 2 => 0, 3 => 1, 4 => 1, 5 => 0],
            [0 => 0, 1 => 1, 2 => 0, 3 => 1, 4 => 1, 5 => 0],
        ];

        $vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), null, 0.5);

        $vectorizer->fit($samples);
        self::assertSame($vocabulary, $vectorizer->getVocabulary());

        $vectorizer->transform($samples);
        self::assertSame($tokensCounts, $samples);

        // word appears at least once in every sample
        $samples = [
            'Lorem ipsum dolor sit amet',
            'Morbi quis sagittis Lorem',
            'eros Lorem',
        ];

        $tokensCounts = [
            [0 => 1, 1 => 0, 2 => 0, 3 => 0, 4 => 0, 5 => 0, 6 => 0, 7 => 0, 8 => 0],
            [0 => 1, 1 => 0, 2 => 0, 3 => 0, 4 => 0, 5 => 0, 6 => 0, 7 => 0, 8 => 0],
            [0 => 1, 1 => 0, 2 => 0, 3 => 0, 4 => 0, 5 => 0, 6 => 0, 7 => 0, 8 => 0],
        ];

        $vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), null, 1);
        $vectorizer->fit($samples);
        $vectorizer->transform($samples);

        self::assertSame($tokensCounts, $samples);
    }
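
    // With a stop word list, 'dolor' and 'diam' are excluded before the
    // vocabulary is built, so the remaining tokens are re-indexed (see the
    // commented-out entries below).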
    public function testTransformationWithStopWords(): void
    {
        $samples = [
            'Lorem ipsum dolor sit amet dolor',
            'Mauris placerat ipsum dolor',
            'Mauris diam eros fringilla diam',
        ];

        $stopWords = new StopWords(['dolor', 'diam']);

        $vocabulary = [
            0 => 'Lorem',
            1 => 'ipsum',
            //2 => 'dolor',
            2 => 'sit',
            3 => 'amet',
            4 => 'Mauris',
            5 => 'placerat',
            //7 => 'diam',
            6 => 'eros',
            7 => 'fringilla',
        ];

        $tokensCounts = [
            [0 => 1, 1 => 1, 2 => 1, 3 => 1, 4 => 0, 5 => 0, 6 => 0, 7 => 0],
            [0 => 0, 1 => 1, 2 => 0, 3 => 0, 4 => 1, 5 => 1, 6 => 0, 7 => 0],
            [0 => 0, 1 => 0, 2 => 0, 3 => 0, 4 => 1, 5 => 0, 6 => 1, 7 => 1],
        ];

        $vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), $stopWords);

        $vectorizer->fit($samples);
        self::assertSame($vocabulary, $vectorizer->getVocabulary());

        $vectorizer->transform($samples);
        self::assertSame($tokensCounts, $samples);
    }
}