Fix FilesDataset arrays and TokenCountVectorizer numeric token (#363)

This commit is contained in:
Arkadiusz Kondas 2019-03-20 23:22:45 +01:00 committed by GitHub
parent 02dab41830
commit 5e02b893e9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 23 additions and 5 deletions

View File

@ -4,6 +4,19 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [0.8.0] - 2019-03-20
### Added
- [Tokenization] Added NGramTokenizer (#350)
- editorconfig file (#355)
### Fixed
- [Dataset] FilesDataset now reads each sample as a plain string instead of wrapping it in an additional array (#363)
- [Tokenization] fixed TokenCountVectorizer error when a token is a numeric value (#363)
### Changed
- [Math] improved performance with pow and sqrt replacement (#350)
- [Math] reduce duplicated code in distance metrics (#348)
- update phpunit to 7.5.1 (#335)
- code style fixes (#334)
## [0.7.0] - 2018-11-07 ## [0.7.0] - 2018-11-07
### Added ### Added
- [Clustering] added KMeans associative clustering (#262) - [Clustering] added KMeans associative clustering (#262)

View File

@ -29,7 +29,7 @@ class FilesDataset extends ArrayDataset
$target = basename($dir); $target = basename($dir);
foreach (array_filter(glob($dir.DIRECTORY_SEPARATOR.'*'), 'is_file') as $file) { foreach (array_filter(glob($dir.DIRECTORY_SEPARATOR.'*'), 'is_file') as $file) {
$this->samples[] = [file_get_contents($file)]; $this->samples[] = file_get_contents($file);
$this->targets[] = $target; $this->targets[] = $target;
} }
} }

View File

@ -157,7 +157,7 @@ class TokenCountVectorizer implements Transformer
$indexes = []; $indexes = [];
foreach ($this->frequencies as $token => $frequency) { foreach ($this->frequencies as $token => $frequency) {
if (($frequency / $samplesCount) < $this->minDF) { if (($frequency / $samplesCount) < $this->minDF) {
$indexes[] = $this->getTokenIndex($token); $indexes[] = $this->getTokenIndex((string) $token);
} }
} }

View File

@ -29,13 +29,13 @@ class FilesDatasetTest extends TestCase
self::assertEquals($targets, array_values(array_unique($dataset->getTargets()))); self::assertEquals($targets, array_values(array_unique($dataset->getTargets())));
$firstSample = file_get_contents($rootPath.'/business/001.txt'); $firstSample = file_get_contents($rootPath.'/business/001.txt');
self::assertEquals($firstSample, $dataset->getSamples()[0][0]); self::assertEquals($firstSample, $dataset->getSamples()[0]);
$firstTarget = 'business'; $firstTarget = 'business';
self::assertEquals($firstTarget, $dataset->getTargets()[0]); self::assertEquals($firstTarget, $dataset->getTargets()[0]);
$lastSample = file_get_contents($rootPath.'/tech/010.txt'); $lastSample = file_get_contents($rootPath.'/tech/010.txt');
self::assertEquals($lastSample, $dataset->getSamples()[49][0]); self::assertEquals($lastSample, $dataset->getSamples()[49]);
$lastTarget = 'tech'; $lastTarget = 'tech';
self::assertEquals($lastTarget, $dataset->getTargets()[49]); self::assertEquals($lastTarget, $dataset->getTargets()[49]);

View File

@ -84,7 +84,7 @@ class TokenCountVectorizerTest extends TestCase
{ {
// word at least in half samples // word at least in half samples
$samples = [ $samples = [
'Lorem ipsum dolor sit amet', 'Lorem ipsum dolor sit amet 1550',
'Lorem ipsum sit amet', 'Lorem ipsum sit amet',
'ipsum sit amet', 'ipsum sit amet',
'ipsum sit amet', 'ipsum sit amet',
@ -96,6 +96,7 @@ class TokenCountVectorizerTest extends TestCase
2 => 'dolor', 2 => 'dolor',
3 => 'sit', 3 => 'sit',
4 => 'amet', 4 => 'amet',
5 => 1550,
]; ];
$tokensCounts = [ $tokensCounts = [
@ -105,6 +106,7 @@ class TokenCountVectorizerTest extends TestCase
2 => 0, 2 => 0,
3 => 1, 3 => 1,
4 => 1, 4 => 1,
5 => 0,
], ],
[ [
0 => 1, 0 => 1,
@ -112,6 +114,7 @@ class TokenCountVectorizerTest extends TestCase
2 => 0, 2 => 0,
3 => 1, 3 => 1,
4 => 1, 4 => 1,
5 => 0,
], ],
[ [
0 => 0, 0 => 0,
@ -119,6 +122,7 @@ class TokenCountVectorizerTest extends TestCase
2 => 0, 2 => 0,
3 => 1, 3 => 1,
4 => 1, 4 => 1,
5 => 0,
], ],
[ [
0 => 0, 0 => 0,
@ -126,6 +130,7 @@ class TokenCountVectorizerTest extends TestCase
2 => 0, 2 => 0,
3 => 1, 3 => 1,
4 => 1, 4 => 1,
5 => 0,
], ],
]; ];