Implement FeatureUnion 🚀 (#382)

This commit is contained in:
Arkadiusz Kondas 2019-05-14 21:26:25 +02:00 committed by GitHub
parent ff118eb2ba
commit b500f0b648
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 235 additions and 34 deletions

View File

@ -91,6 +91,7 @@ Public datasets are available in a separate repository [php-ai/php-ml-datasets](
* Regression
* Workflow
* [Pipeline](http://php-ml.readthedocs.io/en/latest/machine-learning/workflow/pipeline)
* FeatureUnion
* Neural Network
* [Multilayer Perceptron Classifier](http://php-ml.readthedocs.io/en/latest/machine-learning/neural-network/multilayer-perceptron-classifier/)
* Cross Validation
@ -103,6 +104,9 @@ Public datasets are available in a separate repository [php-ai/php-ml-datasets](
* [Normalization](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/normalization/)
* [Imputation missing values](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/imputation-missing-values/)
* LabelEncoder
* LambdaTransformer
* NumberConverter
* ColumnFilter
* Feature Extraction
* [Token Count Vectorizer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/token-count-vectorizer/)
* NGramTokenizer

72
src/FeatureUnion.php Normal file
View File

@ -0,0 +1,72 @@
<?php
declare(strict_types=1);
namespace Phpml;
use Phpml\Exception\InvalidArgumentException;
/**
 * Feeds the same samples through several pipelines and joins, sample by sample,
 * the features each pipeline produces.
 */
final class FeatureUnion implements Transformer
{
    /**
     * @var Pipeline[]
     */
    private $pipelines = [];

    /**
     * @param Pipeline[] $pipelines
     *
     * @throws InvalidArgumentException when $pipelines is empty
     */
    public function __construct(array $pipelines)
    {
        if ($pipelines === []) {
            throw new InvalidArgumentException('At least one pipeline is required');
        }

        // The typed identity callback makes PHP reject any element that is not a Pipeline.
        $this->pipelines = array_map(static function (Pipeline $pipeline): Pipeline {
            return $pipeline;
        }, $pipelines);
    }

    /**
     * Fits every transformer of every pipeline.
     *
     * Inside one pipeline each transformer is fitted on the samples as left by the
     * transformers before it; every pipeline starts again from the samples passed in.
     * $targets is handed to each fit/transform call and is not reset between
     * pipelines, but since both arguments are received by value nothing the
     * transformers change is visible to the caller.
     */
    public function fit(array $samples, ?array $targets = null): void
    {
        foreach ($this->pipelines as $pipeline) {
            $this->runPipeline($pipeline, $samples, $targets, true);
        }
    }

    /**
     * Replaces $samples with the joined output of all pipelines, without fitting.
     */
    public function transform(array &$samples, ?array &$targets = null): void
    {
        $this->transformSamples($samples, $targets);
    }

    /**
     * Fits each transformer right before applying it, then replaces $samples with
     * the joined output of all pipelines.
     */
    public function fitAndTransform(array &$samples, ?array &$targets = null): void
    {
        $this->transformSamples($samples, $targets, true);
    }

    /**
     * Runs every pipeline on its own copy of $samples and merges the results per
     * sample index, in pipeline order.
     *
     * The caller's $samples is only overwritten once all pipelines have finished,
     * so a transformer that throws leaves the caller's samples as they were.
     * $targets is shared by reference across all transformers and pipelines.
     */
    private function transformSamples(array &$samples, ?array &$targets = null, bool $fit = false): void
    {
        $union = [];
        foreach ($this->pipelines as $pipeline) {
            foreach ($this->runPipeline($pipeline, $samples, $targets, $fit) as $index => $sample) {
                // A pipeline may reduce a sample to a single scalar; treat it as a one-feature row.
                $union[$index] = array_merge($union[$index] ?? [], is_array($sample) ? $sample : [$sample]);
            }
        }

        $samples = $union;
    }

    /**
     * Applies (and optionally first fits) the transformers of one pipeline.
     *
     * $samples is received by value, so the transformers work on a private copy.
     *
     * @return array the samples as transformed by this pipeline
     */
    private function runPipeline(Pipeline $pipeline, array $samples, ?array &$targets, bool $fit): array
    {
        foreach ($pipeline->getTransformers() as $transformer) {
            if ($fit) {
                $transformer->fit($samples, $targets);
            }

            $transformer->transform($samples, $targets);
        }

        return $samples;
    }
}

View File

@ -28,7 +28,7 @@ final class Regression
$errors = [];
foreach ($targets as $index => $target) {
$errors[] = (log(1 + $target) - log(1 + $predictions[$index])) ** 2;
$errors[] = log((1 + $target) / (1 + $predictions[$index])) ** 2;
}
return Mean::arithmetic($errors);

View File

@ -4,7 +4,9 @@ declare(strict_types=1);
namespace Phpml;
class Pipeline implements Estimator
use Phpml\Exception\InvalidOperationException;
class Pipeline implements Estimator, Transformer
{
/**
* @var Transformer[]
@ -12,29 +14,18 @@ class Pipeline implements Estimator
private $transformers = [];
/**
* @var Estimator
* @var Estimator|null
*/
private $estimator;
/**
* @param Transformer[] $transformers
*/
public function __construct(array $transformers, Estimator $estimator)
{
foreach ($transformers as $transformer) {
$this->addTransformer($transformer);
}
$this->estimator = $estimator;
}
public function addTransformer(Transformer $transformer): void
{
$this->transformers[] = $transformer;
}
public function setEstimator(Estimator $estimator): void
public function __construct(array $transformers, ?Estimator $estimator = null)
{
$this->transformers = array_map(static function (Transformer $transformer): Transformer {
return $transformer;
}, $transformers);
$this->estimator = $estimator;
}
@ -46,16 +37,20 @@ class Pipeline implements Estimator
return $this->transformers;
}
public function getEstimator(): Estimator
public function getEstimator(): ?Estimator
{
return $this->estimator;
}
public function train(array $samples, array $targets): void
{
if ($this->estimator === null) {
throw new InvalidOperationException('Pipeline without estimator can\'t use train method');
}
foreach ($this->transformers as $transformer) {
$transformer->fit($samples, $targets);
$transformer->transform($samples);
$transformer->transform($samples, $targets);
}
$this->estimator->train($samples, $targets);
@ -66,15 +61,27 @@ class Pipeline implements Estimator
*/
public function predict(array $samples)
{
$this->transformSamples($samples);
if ($this->estimator === null) {
throw new InvalidOperationException('Pipeline without estimator can\'t use predict method');
}
$this->transform($samples);
return $this->estimator->predict($samples);
}
private function transformSamples(array &$samples): void
public function fit(array $samples, ?array $targets = null): void
{
foreach ($this->transformers as $transformer) {
$transformer->transform($samples);
$transformer->fit($samples, $targets);
$transformer->transform($samples, $targets);
}
}
/**
 * Passes the samples (and optional targets) through each transformer in turn,
 * modifying both arguments in place.
 */
public function transform(array &$samples, ?array &$targets = null): void
{
    foreach ($this->transformers as $step) {
        $step->transform($samples, $targets);
    }
}
}

105
tests/FeatureUnionTest.php Normal file
View File

@ -0,0 +1,105 @@
<?php
declare(strict_types=1);
namespace Phpml\Tests;
use Phpml\Exception\InvalidArgumentException;
use Phpml\FeatureUnion;
use Phpml\Pipeline;
use Phpml\Preprocessing\ColumnFilter;
use Phpml\Preprocessing\Imputer;
use Phpml\Preprocessing\Imputer\Strategy\MeanStrategy;
use Phpml\Preprocessing\LabelEncoder;
use Phpml\Preprocessing\LambdaTransformer;
use Phpml\Preprocessing\NumberConverter;
use PHPUnit\Framework\TestCase;
/**
 * Exercises FeatureUnion with two pipelines: one that encodes the "sex" column
 * and one that converts and imputes the "age" and "income" columns.
 */
final class FeatureUnionTest extends TestCase
{
    public function testFitAndTransform(): void
    {
        $samples = [
            ['23', '100000', 'male'],
            ['23', '200000', 'female'],
            ['43', '150000', 'female'],
            ['33', 'n/a', 'male'],
        ];
        $targets = ['1', '2', '1', '3'];

        $this->createUnion(new NumberConverter(true))->fitAndTransform($samples, $targets);

        self::assertEquals([
            [0, 23.0, 100000.0],
            [1, 23.0, 200000.0],
            [1, 43.0, 150000.0],
            [0, 33.0, 150000.0],
        ], $samples);
        // NOTE(review): assertEquals compares loosely, so this likely also passes for the
        // untouched string targets - confirm and consider a strict assertion.
        self::assertEquals([1, 2, 1, 3], $targets);
    }

    public function testFitAndTransformSeparate(): void
    {
        $trainSamples = [
            ['23', '100000', 'male'],
            ['23', '200000', 'female'],
            ['43', '150000', 'female'],
            ['33', 'n/a', 'male'],
        ];
        $testSamples = [
            ['43', '500000', 'female'],
            ['13', 'n/a', 'male'],
            ['53', 'n/a', 'male'],
            ['43', 'n/a', 'female'],
        ];

        $union = $this->createUnion(new NumberConverter());
        $union->fit($trainSamples);
        $union->transform($testSamples);

        self::assertEquals([
            [1, 43.0, 500000.0],
            [0, 13.0, 150000.0],
            [0, 53.0, 150000.0],
            [1, 43.0, 150000.0],
        ], $testSamples);
    }

    public function testNotAllowForEmptyPipelines(): void
    {
        $this->expectException(InvalidArgumentException::class);

        new FeatureUnion([]);
    }

    /**
     * Builds the two-pipeline union shared by the tests above; only the
     * NumberConverter used by the numeric pipeline differs between them.
     */
    private function createUnion(NumberConverter $numberConverter): FeatureUnion
    {
        $columns = ['age', 'income', 'sex'];

        $sexPipeline = new Pipeline([
            new ColumnFilter($columns, ['sex']),
            // Unwrap the single remaining column so each sample is a plain label.
            new LambdaTransformer(function (array $sample) {
                return $sample[0];
            }),
            new LabelEncoder(),
        ]);

        $numericPipeline = new Pipeline([
            new ColumnFilter($columns, ['age', 'income']),
            $numberConverter,
            new Imputer(null, new MeanStrategy(), Imputer::AXIS_COLUMN),
        ]);

        return new FeatureUnion([$sexPipeline, $numericPipeline]);
    }
}

View File

@ -11,9 +11,9 @@ use Phpml\FeatureSelection\SelectKBest;
use Phpml\ModelManager;
use Phpml\Pipeline;
use Phpml\Preprocessing\Imputer;
use Phpml\Preprocessing\Imputer\Strategy\MeanStrategy;
use Phpml\Preprocessing\Imputer\Strategy\MostFrequentStrategy;
use Phpml\Preprocessing\Normalizer;
use Phpml\Regression\SVR;
use Phpml\Tokenization\WordTokenizer;
use PHPUnit\Framework\TestCase;
@ -32,16 +32,6 @@ class PipelineTest extends TestCase
self::assertEquals($estimator, $pipeline->getEstimator());
}
public function testPipelineEstimatorSetter(): void
{
$pipeline = new Pipeline([new TfIdfTransformer()], new SVC());
$estimator = new SVR();
$pipeline->setEstimator($estimator);
self::assertEquals($estimator, $pipeline->getEstimator());
}
public function testPipelineWorkflow(): void
{
$transformers = [
@ -119,6 +109,29 @@ class PipelineTest extends TestCase
self::assertEquals(['b'], $pipeline->predict([[1, 3, 5]]));
}
/**
 * A pipeline built without an estimator can be fitted and then used purely
 * as a transformer.
 */
public function testPipelineAsTransformer(): void
{
    $trainSamples = [
        [10, 20, 30],
        [20, 30, 40],
        [30, 40, 50],
    ];
    $testSamples = [
        [null, null, null],
    ];

    $pipeline = new Pipeline([
        new Imputer(null, new MeanStrategy()),
    ]);
    $pipeline->fit($trainSamples);
    $pipeline->transform($testSamples);

    self::assertEquals([[20.0, 30.0, 40.0]], $testSamples);
}
public function testSaveAndRestore(): void
{
$pipeline = new Pipeline([