diff --git a/README.md b/README.md index 83e2c31..3fef8ed 100644 --- a/README.md +++ b/README.md @@ -91,6 +91,7 @@ Public datasets are available in a separate repository [php-ai/php-ml-datasets]( * Regression * Workflow * [Pipeline](http://php-ml.readthedocs.io/en/latest/machine-learning/workflow/pipeline) + * FeatureUnion * Neural Network * [Multilayer Perceptron Classifier](http://php-ml.readthedocs.io/en/latest/machine-learning/neural-network/multilayer-perceptron-classifier/) * Cross Validation @@ -103,6 +104,9 @@ Public datasets are available in a separate repository [php-ai/php-ml-datasets]( * [Normalization](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/normalization/) * [Imputation missing values](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/imputation-missing-values/) * LabelEncoder + * LambdaTransformer + * NumberConverter + * ColumnFilter * Feature Extraction * [Token Count Vectorizer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/token-count-vectorizer/) * NGramTokenizer diff --git a/src/FeatureUnion.php b/src/FeatureUnion.php new file mode 100644 index 0000000..645a421 --- /dev/null +++ b/src/FeatureUnion.php @@ -0,0 +1,72 @@ +pipelines = array_map(static function (Pipeline $pipeline): Pipeline { + return $pipeline; + }, $pipelines); + } + + public function fit(array $samples, ?array $targets = null): void + { + $originSamples = $samples; + foreach ($this->pipelines as $pipeline) { + foreach ($pipeline->getTransformers() as $transformer) { + $transformer->fit($samples, $targets); + $transformer->transform($samples, $targets); + } + $samples = $originSamples; + } + } + + public function transform(array &$samples, ?array &$targets = null): void + { + $this->transformSamples($samples, $targets); + } + + public function fitAndTransform(array &$samples, ?array &$targets = null): void + { + $this->transformSamples($samples, $targets, true); + } + + private function transformSamples(array &$samples, ?array &$targets = null, bool $fit = false): void + { + $union = []; + $originSamples = $samples; + foreach ($this->pipelines as $pipeline) { + foreach ($pipeline->getTransformers() as $transformer) { + if ($fit) { + $transformer->fit($samples, $targets); + } + $transformer->transform($samples, $targets); + } + + foreach ($samples as $index => $sample) { + $union[$index] = array_merge($union[$index] ?? [], is_array($sample) ? $sample : [$sample]); + } + $samples = $originSamples; + } + + $samples = $union; + } +} diff --git a/src/Metric/Regression.php b/src/Metric/Regression.php index 9f0e024..c833f6a 100644 --- a/src/Metric/Regression.php +++ b/src/Metric/Regression.php @@ -28,7 +28,7 @@ final class Regression $errors = []; foreach ($targets as $index => $target) { - $errors[] = (log(1 + $target) - log(1 + $predictions[$index])) ** 2; + $errors[] = log((1 + $target) / (1 + $predictions[$index])) ** 2; } return Mean::arithmetic($errors); diff --git a/src/Pipeline.php b/src/Pipeline.php index 41188f3..421abb5 100644 --- a/src/Pipeline.php +++ b/src/Pipeline.php @@ -4,7 +4,9 @@ declare(strict_types=1); namespace Phpml; -class Pipeline implements Estimator +use Phpml\Exception\InvalidOperationException; + +class Pipeline implements Estimator, Transformer { /** * @var Transformer[] @@ -12,29 +14,18 @@ class Pipeline implements Estimator private $transformers = []; /** - * @var Estimator + * @var Estimator|null */ private $estimator; /** * @param Transformer[] $transformers */ - public function __construct(array $transformers, Estimator $estimator) - { - foreach ($transformers as $transformer) { - $this->addTransformer($transformer); - } - - $this->estimator = $estimator; - } - - public function addTransformer(Transformer $transformer): void - { - $this->transformers[] = $transformer; - } - - public function setEstimator(Estimator $estimator): void + public function __construct(array $transformers, ?Estimator $estimator = null) { + $this->transformers = array_map(static function (Transformer $transformer): Transformer { + return $transformer; + }, $transformers); $this->estimator = $estimator; } @@ -46,16 +37,20 @@ class Pipeline implements Estimator return $this->transformers; } - public function getEstimator(): Estimator + public function getEstimator(): ?Estimator { return $this->estimator; } public function train(array $samples, array $targets): void { + if ($this->estimator === null) { + throw new InvalidOperationException('Pipeline without estimator can\'t use train method'); + } + foreach ($this->transformers as $transformer) { $transformer->fit($samples, $targets); - $transformer->transform($samples); + $transformer->transform($samples, $targets); } $this->estimator->train($samples, $targets); @@ -66,15 +61,27 @@ class Pipeline implements Estimator */ public function predict(array $samples) { - $this->transformSamples($samples); + if ($this->estimator === null) { + throw new InvalidOperationException('Pipeline without estimator can\'t use predict method'); + } + + $this->transform($samples); return $this->estimator->predict($samples); } - private function transformSamples(array &$samples): void + public function fit(array $samples, ?array $targets = null): void { foreach ($this->transformers as $transformer) { - $transformer->transform($samples); + $transformer->fit($samples, $targets); + $transformer->transform($samples, $targets); + } + } + + public function transform(array &$samples, ?array &$targets = null): void + { + foreach ($this->transformers as $transformer) { + $transformer->transform($samples, $targets); } } } diff --git a/tests/FeatureUnionTest.php b/tests/FeatureUnionTest.php new file mode 100644 index 0000000..0a903b4 --- /dev/null +++ b/tests/FeatureUnionTest.php @@ -0,0 +1,105 @@ +fitAndTransform($samples, $targets); + + self::assertEquals([ + [0, 23.0, 100000.0], + [1, 23.0, 200000.0], + [1, 43.0, 150000.0], + [0, 33.0, 150000.0], + ], $samples); + self::assertEquals([1, 2, 1, 3], $targets); + } + + public function testFitAndTransformSeparate(): void + { + $columns = ['age', 'income', 'sex']; + $trainSamples = [ + ['23', '100000', 'male'], + ['23', '200000', 'female'], + ['43', '150000', 'female'], + ['33', 'n/a', 'male'], + ]; + $testSamples = [ + ['43', '500000', 'female'], + ['13', 'n/a', 'male'], + ['53', 'n/a', 'male'], + ['43', 'n/a', 'female'], + ]; + + $union = new FeatureUnion([ + new Pipeline([ + new ColumnFilter($columns, ['sex']), + new LambdaTransformer(function (array $sample) { + return $sample[0]; + }), + new LabelEncoder(), + ]), + new Pipeline([ + new ColumnFilter($columns, ['age', 'income']), + new NumberConverter(), + new Imputer(null, new MeanStrategy(), Imputer::AXIS_COLUMN), + ]), + ]); + + $union->fit($trainSamples); + $union->transform($testSamples); + + self::assertEquals([ + [1, 43.0, 500000.0], + [0, 13.0, 150000.0], + [0, 53.0, 150000.0], + [1, 43.0, 150000.0], + ], $testSamples); + } + + public function testNotAllowForEmptyPipelines(): void + { + $this->expectException(InvalidArgumentException::class); + + new FeatureUnion([]); + } +} diff --git a/tests/PipelineTest.php b/tests/PipelineTest.php index 31c4f36..f905c8b 100644 --- a/tests/PipelineTest.php +++ b/tests/PipelineTest.php @@ -11,9 +11,9 @@ use Phpml\FeatureSelection\SelectKBest; use Phpml\ModelManager; use Phpml\Pipeline; use Phpml\Preprocessing\Imputer; +use Phpml\Preprocessing\Imputer\Strategy\MeanStrategy; use Phpml\Preprocessing\Imputer\Strategy\MostFrequentStrategy; use Phpml\Preprocessing\Normalizer; -use Phpml\Regression\SVR; use Phpml\Tokenization\WordTokenizer; use PHPUnit\Framework\TestCase; @@ -32,16 +32,6 @@ class PipelineTest extends TestCase self::assertEquals($estimator, $pipeline->getEstimator()); } - public function testPipelineEstimatorSetter(): void - { - $pipeline = new Pipeline([new TfIdfTransformer()], new SVC()); - - $estimator = new SVR(); - $pipeline->setEstimator($estimator); - - self::assertEquals($estimator, $pipeline->getEstimator()); - } - public function testPipelineWorkflow(): void { $transformers = [ @@ -119,6 +109,29 @@ class PipelineTest extends TestCase self::assertEquals(['b'], $pipeline->predict([[1, 3, 5]])); } + public function testPipelineAsTransformer(): void + { + $pipeline = new Pipeline([ + new Imputer(null, new MeanStrategy()), + ]); + + $trainSamples = [ + [10, 20, 30], + [20, 30, 40], + [30, 40, 50], + ]; + + $pipeline->fit($trainSamples); + + $testSamples = [ + [null, null, null], + ]; + + $pipeline->transform($testSamples); + + self::assertEquals([[20.0, 30.0, 40.0]], $testSamples); + } + public function testSaveAndRestore(): void { $pipeline = new Pipeline([