Fix FilesDataset arrays and TokenCountVectorizer numeric token (#363)

This commit is contained in:
Arkadiusz Kondas 2019-03-20 23:22:45 +01:00 committed by GitHub
parent 02dab41830
commit 5e02b893e9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 23 additions and 5 deletions

View File

@ -4,6 +4,19 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [0.8.0] - 2019-03-20
### Added
- [Tokenization] Added NGramTokenizer (#350)
- editorconfig file (#355)
### Fixed
- [Dataset] FilesDataset now reads each sample as a plain string instead of wrapping it in an additional array (#363)
- [Tokenization] fixed error in TokenCountVectorizer when a token is numeric (#363)
### Changed
- [Math] improved performance by replacing pow and sqrt calls (#350)
- [Math] reduced duplicated code in distance metrics (#348)
- update phpunit to 7.5.1 (#335)
- code style fixes (#334)
## [0.7.0] - 2018-11-07
### Added
- [Clustering] added KMeans associative clustering (#262)

View File

@ -29,7 +29,7 @@ class FilesDataset extends ArrayDataset
$target = basename($dir);
foreach (array_filter(glob($dir.DIRECTORY_SEPARATOR.'*'), 'is_file') as $file) {
$this->samples[] = [file_get_contents($file)];
$this->samples[] = file_get_contents($file);
$this->targets[] = $target;
}
}

View File

@ -157,7 +157,7 @@ class TokenCountVectorizer implements Transformer
$indexes = [];
foreach ($this->frequencies as $token => $frequency) {
if (($frequency / $samplesCount) < $this->minDF) {
$indexes[] = $this->getTokenIndex($token);
$indexes[] = $this->getTokenIndex((string) $token);
}
}

View File

@ -29,13 +29,13 @@ class FilesDatasetTest extends TestCase
self::assertEquals($targets, array_values(array_unique($dataset->getTargets())));
$firstSample = file_get_contents($rootPath.'/business/001.txt');
self::assertEquals($firstSample, $dataset->getSamples()[0][0]);
self::assertEquals($firstSample, $dataset->getSamples()[0]);
$firstTarget = 'business';
self::assertEquals($firstTarget, $dataset->getTargets()[0]);
$lastSample = file_get_contents($rootPath.'/tech/010.txt');
self::assertEquals($lastSample, $dataset->getSamples()[49][0]);
self::assertEquals($lastSample, $dataset->getSamples()[49]);
$lastTarget = 'tech';
self::assertEquals($lastTarget, $dataset->getTargets()[49]);

View File

@ -84,7 +84,7 @@ class TokenCountVectorizerTest extends TestCase
{
// word at least in half samples
$samples = [
'Lorem ipsum dolor sit amet',
'Lorem ipsum dolor sit amet 1550',
'Lorem ipsum sit amet',
'ipsum sit amet',
'ipsum sit amet',
@ -96,6 +96,7 @@ class TokenCountVectorizerTest extends TestCase
2 => 'dolor',
3 => 'sit',
4 => 'amet',
5 => 1550,
];
$tokensCounts = [
@ -105,6 +106,7 @@ class TokenCountVectorizerTest extends TestCase
2 => 0,
3 => 1,
4 => 1,
5 => 0,
],
[
0 => 1,
@ -112,6 +114,7 @@ class TokenCountVectorizerTest extends TestCase
2 => 0,
3 => 1,
4 => 1,
5 => 0,
],
[
0 => 0,
@ -119,6 +122,7 @@ class TokenCountVectorizerTest extends TestCase
2 => 0,
3 => 1,
4 => 1,
5 => 0,
],
[
0 => 0,
@ -126,6 +130,7 @@ class TokenCountVectorizerTest extends TestCase
2 => 0,
3 => 1,
4 => 1,
5 => 0,
],
];