diff --git a/README.md b/README.md index 61d215a..5b14f0b 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,9 @@ composer require php-ai/php-ml * [Token Count Vectorizer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/token-count-vectorizer/) * [Tf-idf Transformer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/tf-idf-transformer/) * Datasets + * [Array](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/array-dataset/) * [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset/) + * [Files](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/files-dataset/) * Ready to use: * [Iris](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/iris/) * [Wine](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/wine/) diff --git a/docs/index.md b/docs/index.md index c3088e3..938d73f 100644 --- a/docs/index.md +++ b/docs/index.md @@ -61,7 +61,9 @@ composer require php-ai/php-ml * [Token Count Vectorizer](machine-learning/feature-extraction/token-count-vectorizer/) * [Tf-idf Transformer](machine-learning/feature-extraction/tf-idf-transformer/) * Datasets + * [Array](machine-learning/datasets/array-dataset/) * [CSV](machine-learning/datasets/csv-dataset/) + * [Files](machine-learning/datasets/files-dataset/) * Ready to use: * [Iris](machine-learning/datasets/demo/iris/) * [Wine](machine-learning/datasets/demo/wine/) diff --git a/docs/machine-learning/datasets/files-dataset.md b/docs/machine-learning/datasets/files-dataset.md new file mode 100644 index 0000000..969610c --- /dev/null +++ b/docs/machine-learning/datasets/files-dataset.md @@ -0,0 +1,57 @@ +# FilesDataset + +Helper class that loads dataset from files. Use folder names as targets. It extends the `ArrayDataset`. 
+ +### Constructor Parameters + +* $rootPath - (string) path to root folder that contains files dataset + +``` +use Phpml\Dataset\FilesDataset; + +$dataset = new FilesDataset('path/to/data'); +``` + +See [ArrayDataset](machine-learning/datasets/array-dataset/) for more information. + +### Example + +Files structure: + +``` +data + business + 001.txt + 002.txt + ... + entertainment + 001.txt + 002.txt + ... + politics + 001.txt + 002.txt + ... + sport + 001.txt + 002.txt + ... + tech + 001.txt + 002.txt + ... +``` + +Load files data with `FilesDataset`: + +``` +use Phpml\Dataset\FilesDataset; + +$dataset = new FilesDataset('path/to/data'); + +$dataset->getSamples()[0][0] // content from file path/to/data/business/001.txt +$dataset->getTargets()[0] // business + +$dataset->getSamples()[40][0] // content from file path/to/data/tech/001.txt +$dataset->getTargets()[40] // tech +``` diff --git a/mkdocs.yml b/mkdocs.yml index 2634101..f06a08b 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -29,6 +29,7 @@ pages: - Datasets: - Array Dataset: machine-learning/datasets/array-dataset.md - CSV Dataset: machine-learning/datasets/csv-dataset.md + - Files Dataset: machine-learning/datasets/files-dataset.md - Ready to use datasets: - Iris: machine-learning/datasets/demo/iris.md - Wine: machine-learning/datasets/demo/wine.md diff --git a/src/Phpml/Dataset/FilesDataset.php b/src/Phpml/Dataset/FilesDataset.php index f28e09b..6897ba1 100644 --- a/src/Phpml/Dataset/FilesDataset.php +++ b/src/Phpml/Dataset/FilesDataset.php @@ -1,5 +1,6 @@ scanDir($dir); } } @@ -38,10 +39,9 @@ class FilesDataset extends ArrayDataset { $target = basename($dir); - foreach(array_filter(glob($dir. DIRECTORY_SEPARATOR . 
'*'), 'is_file') as $file) { + foreach (array_filter(glob($dir.DIRECTORY_SEPARATOR.'*'), 'is_file') as $file) { $this->samples[] = [file_get_contents($file)]; $this->targets[] = $target; } } - } diff --git a/tests/Phpml/Dataset/FilesDatasetTest.php b/tests/Phpml/Dataset/FilesDatasetTest.php index 5a5e7e9..5461c2f 100644 --- a/tests/Phpml/Dataset/FilesDatasetTest.php +++ b/tests/Phpml/Dataset/FilesDatasetTest.php @@ -31,8 +31,13 @@ class FilesDatasetTest extends \PHPUnit_Framework_TestCase $firstSample = file_get_contents($rootPath.'/business/001.txt'); $this->assertEquals($firstSample, $dataset->getSamples()[0][0]); + $firstTarget = 'business'; + $this->assertEquals($firstTarget, $dataset->getTargets()[0]); + $lastSample = file_get_contents($rootPath.'/tech/010.txt'); $this->assertEquals($lastSample, $dataset->getSamples()[49][0]); + + $lastTarget = 'tech'; + $this->assertEquals($lastTarget, $dataset->getTargets()[49]); } - }