Fix samples transformation in Pipeline training (#94)

This commit is contained in:
Maxime COLIN 2017-05-24 09:06:54 +02:00 committed by Arkadiusz Kondas
parent de50490154
commit 2d3b44f1a0
2 changed files with 44 additions and 12 deletions

View File

@ -67,8 +67,11 @@ class Pipeline implements Estimator
*/
public function train(array $samples, array $targets)
{
    // Fit and apply the transformers one at a time, in pipeline order, so that
    // every transformer is fitted on the output of the previous one instead of
    // on the raw samples. Fitting all of them up front (and then transforming
    // again) would fit later stages on data they never actually receive.
    // NOTE(review): transform()'s return value is ignored here, so it is
    // presumably expected to modify $samples in place — confirm against the
    // Transformer interface.
    foreach ($this->transformers as $transformer) {
        $transformer->fit($samples);
        $transformer->transform($samples);
    }

    // Train the final estimator on the fully transformed samples.
    $this->estimator->train($samples, $targets);
}
@ -84,16 +87,6 @@ class Pipeline implements Estimator
return $this->estimator->predict($samples);
}
/**
 * Fits every registered transformer on the given samples.
 *
 * Each transformer receives the same $samples array; nothing is transformed
 * between the individual fit() calls.
 *
 * @param array $samples samples handed to each transformer's fit()
 */
private function fitTransformers(array &$samples)
{
    foreach ($this->transformers as $step) {
        $step->fit($samples);
    }
}
/**
* @param array $samples
*/

View File

@ -6,11 +6,13 @@ namespace tests;
use Phpml\Classification\SVC;
use Phpml\FeatureExtraction\TfIdfTransformer;
use Phpml\FeatureExtraction\TokenCountVectorizer;
use Phpml\Pipeline;
use Phpml\Preprocessing\Imputer;
use Phpml\Preprocessing\Normalizer;
use Phpml\Preprocessing\Imputer\Strategy\MostFrequentStrategy;
use Phpml\Regression\SVR;
use Phpml\Tokenization\WordTokenizer;
use PHPUnit\Framework\TestCase;
class PipelineTest extends TestCase
@ -65,4 +67,41 @@ class PipelineTest extends TestCase
$this->assertEquals(4, $predicted[0]);
}
/**
 * Trains a text pipeline (token counts -> TF-IDF -> SVC) on short labelled
 * phrases and checks the labels predicted for two unseen phrases.
 */
public function testPipelineTransformers()
{
    // Training phrases and their labels, index-aligned.
    $trainingSamples = [
        'Hello Paul',
        'Hello Martin',
        'Goodbye Tom',
        'Hello John',
        'Goodbye Alex',
        'Bye Tony',
    ];
    $trainingTargets = [
        'greetings',
        'greetings',
        'farewell',
        'greetings',
        'farewell',
        'farewell',
    ];

    // Raw strings are vectorized first, then re-weighted, then classified.
    $pipeline = new Pipeline(
        [
            new TokenCountVectorizer(new WordTokenizer()),
            new TfIdfTransformer(),
        ],
        new SVC()
    );
    $pipeline->train($trainingSamples, $trainingTargets);

    $this->assertEquals(
        ['greetings', 'farewell'],
        $pipeline->predict(['Hello Max', 'Goodbye Mark'])
    );
}
}