From 5e02b893e904761cd7e2415b11778cd421494c07 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Wed, 20 Mar 2019 23:22:45 +0100 Subject: [PATCH] Fix FilesDataset arrays and TokenCountVectorizer numeric token (#363) --- CHANGELOG.md | 13 +++++++++++++ src/Dataset/FilesDataset.php | 2 +- src/FeatureExtraction/TokenCountVectorizer.php | 2 +- tests/Dataset/FilesDatasetTest.php | 4 ++-- .../FeatureExtraction/TokenCountVectorizerTest.php | 7 ++++++- 5 files changed, 23 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7e00fb1..63507f0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,19 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.8.0] - 2019-03-20 +### Added +- [Tokenization] Added NGramTokenizer (#350) +- editorconfig file (#355) +### Fixed +- [Dataset] FilesDataset read samples without additional array (#363) +- [Tokenization] fixed error with numeric token values (#363) +### Changed +- [Math] improved performance with pow and sqrt replacement (#350) +- [Math] reduce duplicated code in distance metrics (#348) +- update phpunit to 7.5.1 (#335) +- code style fixes (#334) + ## [0.7.0] - 2018-11-07 ### Added - [Clustering] added KMeans associative clustering (#262) diff --git a/src/Dataset/FilesDataset.php b/src/Dataset/FilesDataset.php index a159753..daa7192 100644 --- a/src/Dataset/FilesDataset.php +++ b/src/Dataset/FilesDataset.php @@ -29,7 +29,7 @@ class FilesDataset extends ArrayDataset $target = basename($dir); foreach (array_filter(glob($dir.DIRECTORY_SEPARATOR.'*'), 'is_file') as $file) { - $this->samples[] = [file_get_contents($file)]; + $this->samples[] = file_get_contents($file); $this->targets[] = $target; } } diff --git a/src/FeatureExtraction/TokenCountVectorizer.php b/src/FeatureExtraction/TokenCountVectorizer.php index a1e38f4..afd5f33 100644 --- a/src/FeatureExtraction/TokenCountVectorizer.php +++ b/src/FeatureExtraction/TokenCountVectorizer.php @@ -157,7 +157,7 @@ class TokenCountVectorizer implements Transformer $indexes = []; foreach ($this->frequencies as $token => $frequency) { if (($frequency / $samplesCount) < $this->minDF) { - $indexes[] = $this->getTokenIndex($token); + $indexes[] = $this->getTokenIndex((string) $token); } } diff --git a/tests/Dataset/FilesDatasetTest.php b/tests/Dataset/FilesDatasetTest.php index ec30f91..a7ecd97 100644 --- a/tests/Dataset/FilesDatasetTest.php +++ b/tests/Dataset/FilesDatasetTest.php @@ -29,13 +29,13 @@ class FilesDatasetTest extends TestCase self::assertEquals($targets, array_values(array_unique($dataset->getTargets()))); $firstSample = file_get_contents($rootPath.'/business/001.txt'); - self::assertEquals($firstSample, $dataset->getSamples()[0][0]); + self::assertEquals($firstSample, $dataset->getSamples()[0]); $firstTarget = 'business'; self::assertEquals($firstTarget, $dataset->getTargets()[0]); $lastSample = file_get_contents($rootPath.'/tech/010.txt'); - self::assertEquals($lastSample, $dataset->getSamples()[49][0]); + self::assertEquals($lastSample, $dataset->getSamples()[49]); $lastTarget = 'tech'; self::assertEquals($lastTarget, $dataset->getTargets()[49]); diff --git a/tests/FeatureExtraction/TokenCountVectorizerTest.php b/tests/FeatureExtraction/TokenCountVectorizerTest.php index dff9436..1347915 100644 --- a/tests/FeatureExtraction/TokenCountVectorizerTest.php +++ b/tests/FeatureExtraction/TokenCountVectorizerTest.php @@ -84,7 +84,7 @@ class TokenCountVectorizerTest extends TestCase { // word at least in half samples $samples = [ - 'Lorem ipsum dolor sit amet', + 'Lorem ipsum dolor sit amet 1550', 'Lorem ipsum sit amet', 'ipsum sit amet', 'ipsum sit amet', @@ -96,6 +96,7 @@ class TokenCountVectorizerTest extends TestCase 2 => 'dolor', 3 => 'sit', 4 => 'amet', + 5 => 1550, ]; $tokensCounts = [ @@ -105,6 +106,7 @@ class TokenCountVectorizerTest extends TestCase 2 => 0, 3 => 1, 4 => 1, + 5 => 0, ], [ 0 => 1, @@ -112,6 +114,7 @@ class TokenCountVectorizerTest extends TestCase 2 => 0, 3 => 1, 4 => 1, + 5 => 0, ], [ 0 => 0, @@ -119,6 +122,7 @@ class TokenCountVectorizerTest extends TestCase 2 => 0, 3 => 1, 4 => 1, + 5 => 0, ], [ 0 => 0, @@ -126,6 +130,7 @@ class TokenCountVectorizerTest extends TestCase 2 => 0, 3 => 1, 4 => 1, + 5 => 0, ], ];