php-ml/tests/FeatureExtraction/TokenCountVectorizerTest.php

<?php

declare(strict_types=1);

namespace Phpml\Tests\FeatureExtraction;

use Phpml\FeatureExtraction\StopWords;
use Phpml\FeatureExtraction\TokenCountVectorizer;
use Phpml\Tokenization\WhitespaceTokenizer;
use PHPUnit\Framework\TestCase;

class TokenCountVectorizerTest extends TestCase
{
    /**
     * Fits and transforms three sentences using only a whitespace tokenizer.
     *
     * The expected vocabulary lists every token in order of first appearance,
     * and each transformed sample is expected to hold one count per
     * vocabulary entry.
     */
    public function testTransformationWithWhitespaceTokenizer(): void
    {
        $documents = [
            'Lorem ipsum dolor sit amet dolor',
            'Mauris placerat ipsum dolor',
            'Mauris diam eros fringilla diam',
        ];

        $expectedVocabulary = [
            'Lorem', 'ipsum', 'dolor', 'sit', 'amet',
            'Mauris', 'placerat', 'diam', 'eros', 'fringilla',
        ];

        // One row per document, one column per vocabulary index.
        $expectedCounts = [
            [1, 1, 2, 1, 1, 0, 0, 0, 0, 0],
            [0, 1, 1, 0, 0, 1, 1, 0, 0, 0],
            [0, 0, 0, 0, 0, 1, 0, 2, 1, 1],
        ];

        $vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer());
        $vectorizer->fit($documents);

        self::assertSame($expectedVocabulary, $vectorizer->getVocabulary());

        // transform() rewrites the passed array in place, so the sentences
        // themselves are compared against the expected count rows.
        $vectorizer->transform($documents);

        self::assertSame($expectedCounts, $documents);
    }

    /**
     * Exercises the third constructor argument (minimum document frequency)
     * with two thresholds: 0.5 and 1.
     *
     * In both scenarios the expectations keep a column for every token seen
     * during fit, but tokens that occur in too few samples are expected to
     * be counted as 0 even in the samples that contain them.
     */
    public function testTransformationWithMinimumDocumentTokenCountFrequency(): void
    {
        // Scenario 1: a word must occur in at least half of the samples.
        $documents = [
            'Lorem ipsum dolor sit amet',
            'Lorem ipsum sit amet',
            'ipsum sit amet',
            'ipsum sit amet',
        ];

        $expectedVocabulary = ['Lorem', 'ipsum', 'dolor', 'sit', 'amet'];

        // 'dolor' (index 2) occurs in one sample out of four, so its column
        // is expected to be 0 everywhere; 'Lorem' occurs in two and is kept.
        $rowWithLorem = [1, 1, 0, 1, 1];
        $rowWithoutLorem = [0, 1, 0, 1, 1];
        $expectedCounts = [
            $rowWithLorem,
            $rowWithLorem,
            $rowWithoutLorem,
            $rowWithoutLorem,
        ];

        $halfVectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), null, 0.5);
        $halfVectorizer->fit($documents);

        self::assertSame($expectedVocabulary, $halfVectorizer->getVocabulary());

        $halfVectorizer->transform($documents);

        self::assertSame($expectedCounts, $documents);

        // Scenario 2: a word must occur at least once in all samples.
        $documents = [
            'Lorem ipsum dolor sit amet',
            'Morbi quis sagittis Lorem',
            'eros Lorem',
        ];

        // Only 'Lorem' (index 0) is present in every sample; the other eight
        // columns are expected to be 0 in each row.
        $loremOnlyRow = [1, 0, 0, 0, 0, 0, 0, 0, 0];
        $expectedCounts = [
            $loremOnlyRow,
            $loremOnlyRow,
            $loremOnlyRow,
        ];

        $allVectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), null, 1);
        $allVectorizer->fit($documents);
        $allVectorizer->transform($documents);

        self::assertSame($expectedCounts, $documents);
    }

    /**
     * Fits and transforms with a StopWords instance listing 'dolor' and 'diam'.
     *
     * The expected vocabulary omits both stop words and is numbered
     * contiguously, so the count rows have eight columns instead of ten.
     */
    public function testTransformationWithStopWords(): void
    {
        $documents = [
            'Lorem ipsum dolor sit amet dolor',
            'Mauris placerat ipsum dolor',
            'Mauris diam eros fringilla diam',
        ];

        $ignoredWords = new StopWords(['dolor', 'diam']);

        // 'dolor' would otherwise follow 'ipsum' and 'diam' would follow
        // 'placerat'; neither is expected to receive an index.
        $expectedVocabulary = [
            'Lorem', 'ipsum', 'sit', 'amet',
            'Mauris', 'placerat', 'eros', 'fringilla',
        ];

        $expectedCounts = [
            [1, 1, 1, 1, 0, 0, 0, 0],
            [0, 1, 0, 0, 1, 1, 0, 0],
            [0, 0, 0, 0, 1, 0, 1, 1],
        ];

        $vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), $ignoredWords);
        $vectorizer->fit($documents);

        self::assertSame($expectedVocabulary, $vectorizer->getVocabulary());

        $vectorizer->transform($documents);

        self::assertSame($expectedCounts, $documents);
    }
}
feature extractions tools - TokenCountVectorizez 2016-05-03 21:28:29 +00:00			`<?php`

Update php-cs-fixer 2016-11-20 21:53:17 +00:00			`declare(strict_types=1);`
feature extractions tools - TokenCountVectorizez 2016-05-03 21:28:29 +00:00
Add PHPStan and level to max (#168) * tests: update to PHPUnit 6.0 with rector * fix namespaces on tests * composer + tests: use standard test namespace naming * update travis * resolve conflict * phpstan lvl 2 * phpstan lvl 3 * phpstan lvl 4 * phpstan lvl 5 * phpstan lvl 6 * phpstan lvl 7 * level max * resolve conflict * [cs] clean empty docs * composer: bump to PHPUnit 6.4 * cleanup * composer + travis: add phpstan * phpstan lvl 1 * composer: update dev deps * phpstan fixes * update Contributing with new tools * docs: link fixes, PHP version update * composer: drop php-cs-fixer, cs already handled by ecs * ecs: add old set rules * [cs] apply rest of rules 2018-01-06 12:09:33 +00:00			`namespace Phpml\Tests\FeatureExtraction;`
feature extractions tools - TokenCountVectorizez 2016-05-03 21:28:29 +00:00
implement StopWords in TokenCountVectorizer 2016-07-06 21:22:29 +00:00			`use Phpml\FeatureExtraction\StopWords;`
feature extractions tools - TokenCountVectorizez 2016-05-03 21:28:29 +00:00			`use Phpml\FeatureExtraction\TokenCountVectorizer;`
			`use Phpml\Tokenization\WhitespaceTokenizer;`
Update phpunit to 6.0 2017-02-03 11:58:25 +00:00			`use PHPUnit\Framework\TestCase;`
feature extractions tools - TokenCountVectorizez 2016-05-03 21:28:29 +00:00
Update phpunit to 6.0 2017-02-03 11:58:25 +00:00			`class TokenCountVectorizerTest extends TestCase`
feature extractions tools - TokenCountVectorizez 2016-05-03 21:28:29 +00:00			`{`
Upgrade to PHP 7.1 (#150) * upgrade to PHP 7.1 * bump travis and composer to PHP 7.1 * fix tests 2017-11-14 20:21:23 +00:00			`public function testTransformationWithWhitespaceTokenizer(): void`
feature extractions tools - TokenCountVectorizez 2016-05-03 21:28:29 +00:00			`{`
			`$samples = [`
			`'Lorem ipsum dolor sit amet dolor',`
			`'Mauris placerat ipsum dolor',`
			`'Mauris diam eros fringilla diam',`
			`];`

change token count vectorizer to return full token counts 2016-06-14 07:58:11 +00:00			`$vocabulary = [`
			`0 => 'Lorem',`
			`1 => 'ipsum',`
			`2 => 'dolor',`
			`3 => 'sit',`
			`4 => 'amet',`
			`5 => 'Mauris',`
			`6 => 'placerat',`
			`7 => 'diam',`
			`8 => 'eros',`
			`9 => 'fringilla',`
			`];`

			`$tokensCounts = [`
Added EasyCodingStandard + lots of code fixes (#156) * travis: move coveralls here, decouple from package * composer: use PSR4 * phpunit: simpler config * travis: add ecs run * composer: add ecs dev * use standard vendor/bin directory for dependency bins, confuses with local bins and require gitignore handling * ecs: add PSR2 * [cs] PSR2 spacing fixes * [cs] PSR2 class name fix * [cs] PHP7 fixes - return semicolon spaces, old rand functions, typehints * [cs] fix less strict typehints * fix typehints to make tests pass * ecs: ignore typehint-less elements * [cs] standardize arrays * [cs] standardize docblock, remove unused comments * [cs] use self where possible * [cs] sort class elements, from public to private * [cs] do not use yoda (found less yoda-cases, than non-yoda) * space * [cs] do not assign in condition * [cs] use namespace imports if possible * [cs] use ::class over strings * [cs] fix defaults for arrays properties, properties and constants single spacing * cleanup ecs comments * [cs] use item per line in multi-items array * missing line * misc * rebase 2017-11-22 21:16:10 +00:00			`[`
			`0 => 1,`
			`1 => 1,`
			`2 => 2,`
			`3 => 1,`
			`4 => 1,`
			`5 => 0,`
			`6 => 0,`
			`7 => 0,`
			`8 => 0,`
			`9 => 0,`
			`],`
			`[`
			`0 => 0,`
			`1 => 1,`
			`2 => 1,`
			`3 => 0,`
			`4 => 0,`
			`5 => 1,`
			`6 => 1,`
			`7 => 0,`
			`8 => 0,`
			`9 => 0,`
			`],`
			`[`
			`0 => 0,`
			`1 => 0,`
			`2 => 0,`
			`3 => 0,`
			`4 => 0,`
			`5 => 1,`
			`6 => 0,`
			`7 => 2,`
			`8 => 1,`
			`9 => 1,`
			`],`
feature extractions tools - TokenCountVectorizez 2016-05-03 21:28:29 +00:00			`];`

			`$vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer());`

implement fit fot TokenCountVectorizer 2016-06-16 22:33:48 +00:00			`$vectorizer->fit($samples);`
Update phpstan to 0.10.5 (#320) 2018-10-28 06:44:52 +00:00			`self::assertSame($vocabulary, $vectorizer->getVocabulary());`
implement fit fot TokenCountVectorizer 2016-06-16 22:33:48 +00:00
			`$vectorizer->transform($samples);`
Update phpstan to 0.10.5 (#320) 2018-10-28 06:44:52 +00:00			`self::assertSame($tokensCounts, $samples);`
feature extractions tools - TokenCountVectorizez 2016-05-03 21:28:29 +00:00			`}`

Upgrade to PHP 7.1 (#150) * upgrade to PHP 7.1 * bump travis and composer to PHP 7.1 * fix tests 2017-11-14 20:21:23 +00:00			`public function testTransformationWithMinimumDocumentTokenCountFrequency(): void`
feature extractions tools - TokenCountVectorizez 2016-05-03 21:28:29 +00:00			`{`
			`// word at least in half samples`
			`$samples = [`
			`'Lorem ipsum dolor sit amet',`
			`'Lorem ipsum sit amet',`
			`'ipsum sit amet',`
			`'ipsum sit amet',`
			`];`

change token count vectorizer to return full token counts 2016-06-14 07:58:11 +00:00			`$vocabulary = [`
			`0 => 'Lorem',`
			`1 => 'ipsum',`
			`2 => 'dolor',`
			`3 => 'sit',`
			`4 => 'amet',`
			`];`

			`$tokensCounts = [`
Added EasyCodingStandard + lots of code fixes (#156) * travis: move coveralls here, decouple from package * composer: use PSR4 * phpunit: simpler config * travis: add ecs run * composer: add ecs dev * use standard vendor/bin directory for dependency bins, confuses with local bins and require gitignore handling * ecs: add PSR2 * [cs] PSR2 spacing fixes * [cs] PSR2 class name fix * [cs] PHP7 fixes - return semicolon spaces, old rand functions, typehints * [cs] fix less strict typehints * fix typehints to make tests pass * ecs: ignore typehint-less elements * [cs] standardize arrays * [cs] standardize docblock, remove unused comments * [cs] use self where possible * [cs] sort class elements, from public to private * [cs] do not use yoda (found less yoda-cases, than non-yoda) * space * [cs] do not assign in condition * [cs] use namespace imports if possible * [cs] use ::class over strings * [cs] fix defaults for arrays properties, properties and constants single spacing * cleanup ecs comments * [cs] use item per line in multi-items array * missing line * misc * rebase 2017-11-22 21:16:10 +00:00			`[`
			`0 => 1,`
			`1 => 1,`
			`2 => 0,`
			`3 => 1,`
			`4 => 1,`
			`],`
			`[`
			`0 => 1,`
			`1 => 1,`
			`2 => 0,`
			`3 => 1,`
			`4 => 1,`
			`],`
			`[`
			`0 => 0,`
			`1 => 1,`
			`2 => 0,`
			`3 => 1,`
			`4 => 1,`
			`],`
			`[`
			`0 => 0,`
			`1 => 1,`
			`2 => 0,`
			`3 => 1,`
			`4 => 1,`
			`],`
feature extractions tools - TokenCountVectorizez 2016-05-03 21:28:29 +00:00			`];`

implement StopWords in TokenCountVectorizer 2016-07-06 21:22:29 +00:00			`$vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), null, 0.5);`
feature extractions tools - TokenCountVectorizez 2016-05-03 21:28:29 +00:00
implement fit fot TokenCountVectorizer 2016-06-16 22:33:48 +00:00			`$vectorizer->fit($samples);`
Update phpstan to 0.10.5 (#320) 2018-10-28 06:44:52 +00:00			`self::assertSame($vocabulary, $vectorizer->getVocabulary());`
feature extractions tools - TokenCountVectorizez 2016-05-03 21:28:29 +00:00
implement fit fot TokenCountVectorizer 2016-06-16 22:33:48 +00:00			`$vectorizer->transform($samples);`
Update phpstan to 0.10.5 (#320) 2018-10-28 06:44:52 +00:00			`self::assertSame($tokensCounts, $samples);`
implement fit fot TokenCountVectorizer 2016-06-16 22:33:48 +00:00
change token count vectorizer to return full token counts 2016-06-14 07:58:11 +00:00			`// word at least once in all samples`
feature extractions tools - TokenCountVectorizez 2016-05-03 21:28:29 +00:00			`$samples = [`
			`'Lorem ipsum dolor sit amet',`
change token count vectorizer to return full token counts 2016-06-14 07:58:11 +00:00			`'Morbi quis sagittis Lorem',`
			`'eros Lorem',`
feature extractions tools - TokenCountVectorizez 2016-05-03 21:28:29 +00:00			`];`

change token count vectorizer to return full token counts 2016-06-14 07:58:11 +00:00			`$tokensCounts = [`
Added EasyCodingStandard + lots of code fixes (#156) * travis: move coveralls here, decouple from package * composer: use PSR4 * phpunit: simpler config * travis: add ecs run * composer: add ecs dev * use standard vendor/bin directory for dependency bins, confuses with local bins and require gitignore handling * ecs: add PSR2 * [cs] PSR2 spacing fixes * [cs] PSR2 class name fix * [cs] PHP7 fixes - return semicolon spaces, old rand functions, typehints * [cs] fix less strict typehints * fix typehints to make tests pass * ecs: ignore typehint-less elements * [cs] standardize arrays * [cs] standardize docblock, remove unused comments * [cs] use self where possible * [cs] sort class elements, from public to private * [cs] do not use yoda (found less yoda-cases, than non-yoda) * space * [cs] do not assign in condition * [cs] use namespace imports if possible * [cs] use ::class over strings * [cs] fix defaults for arrays properties, properties and constants single spacing * cleanup ecs comments * [cs] use item per line in multi-items array * missing line * misc * rebase 2017-11-22 21:16:10 +00:00			`[`
			`0 => 1,`
			`1 => 0,`
			`2 => 0,`
			`3 => 0,`
			`4 => 0,`
			`5 => 0,`
			`6 => 0,`
			`7 => 0,`
			`8 => 0,`
			`],`
			`[`
			`0 => 1,`
			`1 => 0,`
			`2 => 0,`
			`3 => 0,`
			`4 => 0,`
			`5 => 0,`
			`6 => 0,`
			`7 => 0,`
			`8 => 0,`
			`],`
			`[`
			`0 => 1,`
			`1 => 0,`
			`2 => 0,`
			`3 => 0,`
			`4 => 0,`
			`5 => 0,`
			`6 => 0,`
			`7 => 0,`
			`8 => 0,`
			`],`
feature extractions tools - TokenCountVectorizez 2016-05-03 21:28:29 +00:00			`];`

implement StopWords in TokenCountVectorizer 2016-07-06 21:22:29 +00:00			`$vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), null, 1);`
implement fit fot TokenCountVectorizer 2016-06-16 22:33:48 +00:00			`$vectorizer->fit($samples);`
change transformer behavior to reference 2016-06-16 08:01:40 +00:00			`$vectorizer->transform($samples);`
feature extractions tools - TokenCountVectorizez 2016-05-03 21:28:29 +00:00
Update phpstan to 0.10.5 (#320) 2018-10-28 06:44:52 +00:00			`self::assertSame($tokensCounts, $samples);`
feature extractions tools - TokenCountVectorizez 2016-05-03 21:28:29 +00:00			`}`
implement StopWords in TokenCountVectorizer 2016-07-06 21:22:29 +00:00
Upgrade to PHP 7.1 (#150) * upgrade to PHP 7.1 * bump travis and composer to PHP 7.1 * fix tests 2017-11-14 20:21:23 +00:00			`public function testTransformationWithStopWords(): void`
implement StopWords in TokenCountVectorizer 2016-07-06 21:22:29 +00:00			`{`
			`$samples = [`
			`'Lorem ipsum dolor sit amet dolor',`
			`'Mauris placerat ipsum dolor',`
			`'Mauris diam eros fringilla diam',`
			`];`

			`$stopWords = new StopWords(['dolor', 'diam']);`

			`$vocabulary = [`
			`0 => 'Lorem',`
			`1 => 'ipsum',`
			`//2 => 'dolor',`
			`2 => 'sit',`
			`3 => 'amet',`
			`4 => 'Mauris',`
			`5 => 'placerat',`
			`//7 => 'diam',`
			`6 => 'eros',`
			`7 => 'fringilla',`
			`];`

			`$tokensCounts = [`
Added EasyCodingStandard + lots of code fixes (#156) * travis: move coveralls here, decouple from package * composer: use PSR4 * phpunit: simpler config * travis: add ecs run * composer: add ecs dev * use standard vendor/bin directory for dependency bins, confuses with local bins and require gitignore handling * ecs: add PSR2 * [cs] PSR2 spacing fixes * [cs] PSR2 class name fix * [cs] PHP7 fixes - return semicolon spaces, old rand functions, typehints * [cs] fix less strict typehints * fix typehints to make tests pass * ecs: ignore typehint-less elements * [cs] standardize arrays * [cs] standardize docblock, remove unused comments * [cs] use self where possible * [cs] sort class elements, from public to private * [cs] do not use yoda (found less yoda-cases, than non-yoda) * space * [cs] do not assign in condition * [cs] use namespace imports if possible * [cs] use ::class over strings * [cs] fix defaults for arrays properties, properties and constants single spacing * cleanup ecs comments * [cs] use item per line in multi-items array * missing line * misc * rebase 2017-11-22 21:16:10 +00:00			`[`
			`0 => 1,`
			`1 => 1,`
			`2 => 1,`
			`3 => 1,`
			`4 => 0,`
			`5 => 0,`
			`6 => 0,`
			`7 => 0,`
			`],`
			`[`
			`0 => 0,`
			`1 => 1,`
			`2 => 0,`
			`3 => 0,`
			`4 => 1,`
			`5 => 1,`
			`6 => 0,`
			`7 => 0,`
			`],`
			`[`
			`0 => 0,`
			`1 => 0,`
			`2 => 0,`
			`3 => 0,`
			`4 => 1,`
			`5 => 0,`
			`6 => 1,`
			`7 => 1,`
			`],`
implement StopWords in TokenCountVectorizer 2016-07-06 21:22:29 +00:00			`];`

			`$vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), $stopWords);`

			`$vectorizer->fit($samples);`
Update phpstan to 0.10.5 (#320) 2018-10-28 06:44:52 +00:00			`self::assertSame($vocabulary, $vectorizer->getVocabulary());`
implement StopWords in TokenCountVectorizer 2016-07-06 21:22:29 +00:00
			`$vectorizer->transform($samples);`
Update phpstan to 0.10.5 (#320) 2018-10-28 06:44:52 +00:00			`self::assertSame($tokensCounts, $samples);`
implement StopWords in TokenCountVectorizer 2016-07-06 21:22:29 +00:00			`}`
feature extractions tools - TokenCountVectorizez 2016-05-03 21:28:29 +00:00			`}`