Mirror of https://github.com/Llewellynvdm/php-ml.git
Fix FilesDataset arrays and TokenCountVectorizer numeric token (#363)
parent 02dab41830
commit 5e02b893e9
CHANGELOG.md (13 lines changed)

@@ -4,6 +4,19 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.8.0] - 2019-03-20
+### Added
+- [Tokenization] Added NGramTokenizer (#350)
+- editorconfig file (#355)
+### Fixed
+- [Dataset] FilesDataset read samples without additional array (#363)
+- [Tokenization] fixed error with numeric token values (#363)
+### Changed
+- [Math] improved performance with pow and sqrt replacement (#350)
+- [Math] reduce duplicated code in distance metrics (#348)
+- update phpunit to 7.5.1 (#335)
+- code style fixes (#334)
+
 ## [0.7.0] - 2018-11-07
 ### Added
 - [Clustering] added KMeans associative clustering (#262)

@@ -29,7 +29,7 @@ class FilesDataset extends ArrayDataset
         $target = basename($dir);
 
         foreach (array_filter(glob($dir.DIRECTORY_SEPARATOR.'*'), 'is_file') as $file) {
-            $this->samples[] = [file_get_contents($file)];
+            $this->samples[] = file_get_contents($file);
             $this->targets[] = $target;
         }
     }

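For context, a minimal usage sketch of what this change means for callers, assuming the fixture layout used in the test further down (one subdirectory per category, one text file per sample); the `data/bbc` path and variable names are illustrative, not part of the commit:

```php
<?php

declare(strict_types=1);

// Composer autoloader; assumes php-ml is installed as a dependency.
require_once 'vendor/autoload.php';

use Phpml\Dataset\FilesDataset;

// One subdirectory per target class, e.g. data/bbc/business/001.txt, data/bbc/tech/010.txt.
$dataset = new FilesDataset('data/bbc');

// Before this fix each sample was wrapped in a one-element array, so callers had to
// reach for getSamples()[0][0]; now getSamples() returns the raw file contents directly.
$firstSample = $dataset->getSamples()[0]; // string: contents of the first file
$firstTarget = $dataset->getTargets()[0]; // string: name of the subdirectory it came from
```
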
@@ -157,7 +157,7 @@ class TokenCountVectorizer implements Transformer
         $indexes = [];
         foreach ($this->frequencies as $token => $frequency) {
             if (($frequency / $samplesCount) < $this->minDF) {
-                $indexes[] = $this->getTokenIndex($token);
+                $indexes[] = $this->getTokenIndex((string) $token);
             }
         }
 

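The `(string)` cast is needed because of how PHP stores array keys; the self-contained snippet below (not from the commit) reproduces the underlying behaviour:

```php
<?php

declare(strict_types=1);

// PHP silently converts numeric-string array keys to integers, so when a token such as
// "1550" is used as a key of $this->frequencies it comes back out of the foreach as
// int(1550). Under strict types, passing that int to a method declared with a string
// parameter (as getTokenIndex() appears to be) throws a TypeError, hence the cast above.
$frequencies = [
    'amet' => 3,
    '1550' => 1, // stored as int(1550), not string "1550"
];

foreach ($frequencies as $token => $frequency) {
    var_dump($token); // string(4) "amet", then int(1550)
}
```
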
@@ -29,13 +29,13 @@ class FilesDatasetTest extends TestCase
         self::assertEquals($targets, array_values(array_unique($dataset->getTargets())));
 
         $firstSample = file_get_contents($rootPath.'/business/001.txt');
-        self::assertEquals($firstSample, $dataset->getSamples()[0][0]);
+        self::assertEquals($firstSample, $dataset->getSamples()[0]);
 
         $firstTarget = 'business';
         self::assertEquals($firstTarget, $dataset->getTargets()[0]);
 
         $lastSample = file_get_contents($rootPath.'/tech/010.txt');
-        self::assertEquals($lastSample, $dataset->getSamples()[49][0]);
+        self::assertEquals($lastSample, $dataset->getSamples()[49]);
 
         $lastTarget = 'tech';
         self::assertEquals($lastTarget, $dataset->getTargets()[49]);

@@ -84,7 +84,7 @@ class TokenCountVectorizerTest extends TestCase
     {
         // word at least in half samples
         $samples = [
-            'Lorem ipsum dolor sit amet',
+            'Lorem ipsum dolor sit amet 1550',
             'Lorem ipsum sit amet',
             'ipsum sit amet',
             'ipsum sit amet',
@@ -96,6 +96,7 @@ class TokenCountVectorizerTest extends TestCase
             2 => 'dolor',
             3 => 'sit',
             4 => 'amet',
+            5 => 1550,
         ];
 
         $tokensCounts = [
@@ -105,6 +106,7 @@ class TokenCountVectorizerTest extends TestCase
                 2 => 0,
                 3 => 1,
                 4 => 1,
+                5 => 0,
             ],
             [
                 0 => 1,
@@ -112,6 +114,7 @@ class TokenCountVectorizerTest extends TestCase
                 2 => 0,
                 3 => 1,
                 4 => 1,
+                5 => 0,
             ],
             [
                 0 => 0,
@@ -119,6 +122,7 @@ class TokenCountVectorizerTest extends TestCase
                 2 => 0,
                 3 => 1,
                 4 => 1,
+                5 => 0,
             ],
             [
                 0 => 0,
@@ -126,6 +130,7 @@ class TokenCountVectorizerTest extends TestCase
                 2 => 0,
                 3 => 1,
                 4 => 1,
+                5 => 0,
             ],
         ];
 
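Putting the pieces together, a rough sketch of the scenario the updated test covers, assuming the 0.8.x API (`WhitespaceTokenizer`, a `TokenCountVectorizer` constructed with a minimum document frequency, and an in-place `transform()`); it is an illustration, not a verbatim copy of the test:

```php
<?php

declare(strict_types=1);

// Composer autoloader; assumes php-ml is installed as a dependency.
require_once 'vendor/autoload.php';

use Phpml\FeatureExtraction\TokenCountVectorizer;
use Phpml\Tokenization\WhitespaceTokenizer;

$samples = [
    'Lorem ipsum dolor sit amet 1550',
    'Lorem ipsum sit amet',
    'ipsum sit amet',
    'ipsum sit amet',
];

// Keep only tokens that appear in at least half of the samples; before this fix,
// fitting a sample containing a purely numeric token such as "1550" could trigger
// a type error inside the vectorizer.
$vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), null, 0.5);
$vectorizer->fit($samples);
$vectorizer->transform($samples);

print_r($vectorizer->getVocabulary()); // includes 1550 as a token
print_r($samples);                     // counts for rare tokens ("dolor", "1550") are zeroed out
```
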