From c8bd8db6019e0fdeb4c7907282e306fbd8675013 Mon Sep 17 00:00:00 2001 From: Patrick Florek Date: Tue, 23 Aug 2016 15:44:53 +0200 Subject: [PATCH 1/2] # Association rule learning - Apriori algorithm * Generating frequent k-length item sets * Generating rules based on frequent item sets * Algorithm has exponential complexity, be aware of it * Apriori algorithm is split into apriori and candidates method * Second step rule generation is implemented by rules method * Internal methods are invoked for fine grain unit tests * Wikipedia's train samples and an alternative are provided for test cases * Small documentation for public interface is also shipped --- .gitignore | 1 + docs/machine-learning/association/apriori.md | 54 +++ src/Phpml/Association/Apriori.php | 325 +++++++++++++++++++ src/Phpml/Association/Associator.php | 11 + tests/Phpml/Association/AprioriTest.php | 187 +++++++++++ 5 files changed, 578 insertions(+) create mode 100644 docs/machine-learning/association/apriori.md create mode 100644 src/Phpml/Association/Apriori.php create mode 100644 src/Phpml/Association/Associator.php create mode 100644 tests/Phpml/Association/AprioriTest.php diff --git a/.gitignore b/.gitignore index 8a409f4..e85e1fd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +/.idea/ /vendor/ humbuglog.* /bin/phpunit diff --git a/docs/machine-learning/association/apriori.md b/docs/machine-learning/association/apriori.md new file mode 100644 index 0000000..c5986f4 --- /dev/null +++ b/docs/machine-learning/association/apriori.md @@ -0,0 +1,54 @@ +# Apriori Associator + +Association rule learning based on [Apriori algorithm](https://en.wikipedia.org/wiki/Apriori_algorithm) for frequent item set mining. 
+ +### Constructor Parameters + +* $support - [support](https://en.wikipedia.org/wiki/Association_rule_learning#Support), minimum relative amount of frequent item set in train sample +* $confidence - [confidence](https://en.wikipedia.org/wiki/Association_rule_learning#Confidence), minimum relative amount of item set in frequent item sets + +``` +$associator = new \Phpml\Association\Apriori($support = 0.5, $confidence = 0.5); +``` + +### Train + +To train an associator, simply provide train samples and labels (as `array`). Example: + +``` +$samples = [['alpha', 'beta', 'epsilon'], ['alpha', 'beta', 'theta'], ['alpha', 'beta', 'epsilon'], ['alpha', 'beta', 'theta']]; +$labels = []; + +$associator = new \Phpml\Association\Apriori(0.5, 0.5); +$associator->train($samples, $labels); +``` + +### Predict + +To predict a sample's label, use the `predict` method. You can provide one sample or an array of samples: + +``` +$associator->predict(['alpha','theta']); +// return [[['beta']]] + +$associator->predict([['alpha','epsilon'],['beta','theta']]); +// return [[['beta']], [['alpha']]] +``` + +### Associating + +Generating association rules simply use `rules` method. + +``` +$associator->rules(); +// return [['antecedent' => ['alpha', 'theta'], 'consequent' => ['beta], 'support' => 1.0, 'confidence' => 1.0], ... ] +``` + +### Frequent item sets + +To generate k-length frequent item sets, simply use the `apriori` method. + +``` +$associator->apriori(); +// return [ 1 => [['alpha'], ['beta'], ['theta'], ['epsilon']], 2 => [...], ...] +``` diff --git a/src/Phpml/Association/Apriori.php b/src/Phpml/Association/Apriori.php new file mode 100644 index 0000000..bf52c27 --- /dev/null +++ b/src/Phpml/Association/Apriori.php @@ -0,0 +1,325 @@ +support = $support; + $this->confidence = $confidence; + } + + /** + * Generates apriori association rules. 
+ * + * @return mixed[][] + */ + public function rules() + { + if (!$this->large) { + $this->large = $this->apriori(); + } + + if ($this->rules) { + return $this->rules; + } + + $this->rules = []; + + for ($k = 2; !empty($this->large[$k]); ++$k) { + foreach ($this->large[$k] as $frequent) { + foreach ($this->antecedents($frequent) as $antecedent) { + if ($this->confidence <= ($confidence = $this->confidence($frequent, $antecedent))) { + $consequent = array_values(array_diff($frequent, $antecedent)); + $this->rules[] = [ + self::ARRAY_KEY_ANTECEDENT => $antecedent, + self::ARRAY_KEY_CONSEQUENT => $consequent, + self::ARRAY_KEY_SUPPORT => $this->support($consequent), + self::ARRAY_KEY_CONFIDENCE => $confidence, + ]; + } + } + } + } + + return $this->rules; + } + + /** + * Generates frequent item sets + * + * @return mixed[][][] + */ + public function apriori() + { + $L = []; + $L[1] = $this->items(); + $L[1] = $this->frequent($L[1]); + + for ($k = 2; !empty($L[$k - 1]); ++$k) { + $L[$k] = $this->candidates($L[$k - 1]); + $L[$k] = $this->frequent($L[$k]); + } + + return $L; + } + + /** + * @param mixed[] $sample + * + * @return mixed[][] + */ + protected function predictSample(array $sample) + { + $predicts = array_values(array_filter($this->rules(), function($rule) use ($sample) { + return $this->equals($rule[self::ARRAY_KEY_ANTECEDENT], $sample); + })); + + return array_map(function($rule) { return $rule[self::ARRAY_KEY_CONSEQUENT]; }, $predicts); + } + + /** + * Generates the power set for given item set $sample. + * + * @param mixed[] $sample + * + * @return mixed[][] + */ + private function powerSet(array $sample) + { + $results = [[]]; + foreach ($sample as $item) { + foreach ($results as $combination) { + $results[] = array_merge(array($item), $combination); + } + } + + return $results; + } + + /** + * Generates all proper subsets for given set $sample without the empty set. 
+ * + * @param mixed[] $sample + * + * @return mixed[][] + */ + private function antecedents(array $sample) + { + $cardinality = count($sample); + $antecedents = $this->powerSet($sample); + + return array_filter($antecedents, function($antecedent) use ($cardinality) { + return (count($antecedent) != $cardinality) && ($antecedent != []); + }); + } + + /** + * Calculates frequent k = 1 item sets. + * + * @return mixed[][] + */ + private function items() + { + $items = []; + + foreach ($this->samples as $sample) { + foreach ($sample as $item) { + if (!in_array($item, $items, true)) { + $items[] = $item; + } + } + } + + return array_map(function($entry) { + return [$entry]; + }, $items); + } + + /** + * Returns frequent item sets only. + * + * @param mixed[][] $samples + * + * @return mixed[][] + */ + private function frequent(array $samples) + { + return array_filter($samples, function($entry) { + return $this->support($entry) >= $this->support; + }); + } + + /** + * Calculates frequent k item sets, where count($samples) == $k - 1. + * + * @param mixed[][] $samples + * + * @return mixed[][] + */ + private function candidates(array $samples) + { + $candidates = []; + + foreach ($samples as $p) { + foreach ($samples as $q) { + if (count(array_merge(array_diff($p, $q), array_diff($q, $p))) != 2) { + continue; + } + + $candidate = array_unique(array_merge($p, $q)); + + if ($this->contains($candidates, $candidate)) { + continue; + } + + foreach ((array)$this->samples as $sample) { + if ($this->subset($sample, $candidate)) { + $candidates[] = $candidate; + continue 2; + } + } + } + } + + return $candidates; + } + + /** + * Calculates confidence for $set. Confidence is the relative amount of sets containing $subset which also contain + * $set. 
+ * + * @param mixed[] $set + * @param mixed[] $subset + * + * @return float + */ + private function confidence(array $set, array $subset) + { + return $this->support($set) / $this->support($subset); + } + + /** + * Calculates support for item set $sample. Support is the relative amount of sets containing $sample in the data + * pool. + * + * @see \Phpml\Association\Apriori::samples + * + * @param mixed[] $sample + * + * @return float + */ + private function support(array $sample) + { + return $this->frequency($sample) / count($this->samples); + } + + /** + * Counts occurrences of $sample as subset in data pool. + * + * @see \Phpml\Association\Apriori::samples + * + * @param mixed[] $sample + * + * @return int + */ + private function frequency(array $sample) + { + return count(array_filter($this->samples, function($entry) use ($sample) { + return $this->subset($entry, $sample); + })); + } + + /** + * Returns true if set is an element of system. + * + * @see \Phpml\Association\Apriori::equals() + * + * @param mixed[][] $system + * @param mixed[] $set + * + * @return bool + */ + private function contains(array $system, array $set) + { + return (bool)array_filter($system, function($entry) use ($set) { + return $this->equals($entry, $set); + }); + } + + /** + * Returns true if subset is a (proper) subset of set by its items string representation. + * + * @param mixed[] $set + * @param mixed[] $subset + * + * @return bool + */ + private function subset(array $set, array $subset) + { + return !array_diff($subset, array_intersect($subset, $set)); + } + + /** + * Returns true if string representation of items does not differ. 
+ * + * @param mixed[] $set1 + * @param mixed[] $set2 + * + * @return bool + */ + private function equals(array $set1, array $set2) + { + return array_diff($set1, $set2) == array_diff($set2, $set1); + } +} diff --git a/src/Phpml/Association/Associator.php b/src/Phpml/Association/Associator.php new file mode 100644 index 0000000..c339b5e --- /dev/null +++ b/src/Phpml/Association/Associator.php @@ -0,0 +1,11 @@ +train($this->sampleGreek, []); + + $this->assertEquals('beta', $apriori->predict([['alpha', 'epsilon'], ['beta', 'theta']])[0][0][0]); + $this->assertEquals('alpha', $apriori->predict([['alpha', 'epsilon'], ['beta', 'theta']])[1][0][0]); + } + + public function testPowerSet() + { + $apriori = new Apriori(); + + $this->assertCount(8, $this->invoke($apriori, 'powerSet', [['a', 'b', 'c']])); + } + + public function testApriori() + { + $apriori = new Apriori(3 / 7); + $apriori->train($this->sampleBasket, []); + + $L = $apriori->apriori(); + + $this->assertCount(0, $L[3]); + $this->assertCount(4, $L[2]); + $this->assertTrue($this->invoke($apriori, 'contains', [$L[2], [1, 2]])); + $this->assertFalse($this->invoke($apriori, 'contains', [$L[2], [1, 3]])); + $this->assertFalse($this->invoke($apriori, 'contains', [$L[2], [1, 4]])); + $this->assertTrue($this->invoke($apriori, 'contains', [$L[2], [2, 3]])); + $this->assertTrue($this->invoke($apriori, 'contains', [$L[2], [2, 4]])); + $this->assertTrue($this->invoke($apriori, 'contains', [$L[2], [3, 4]])); + } + + public function testRules() + { + $apriori = new Apriori(0.4, 0.8); + $apriori->train($this->sampleChars, []); + + $this->assertCount(19, $apriori->rules()); + } + + public function testAntecedents() + { + $apriori = new Apriori(); + + $this->assertCount(6, $this->invoke($apriori, 'antecedents', [['a', 'b', 'c']])); + } + + public function testItems() + { + $apriori = new Apriori(); + $apriori->train($this->sampleGreek, []); + $this->assertCount(4, $this->invoke($apriori, 'items', [])); + } + + public function 
testFrequent() + { + $apriori = new Apriori(0.51); + $apriori->train($this->sampleGreek, []); + + $this->assertCount(0, $this->invoke($apriori, 'frequent', [[['epsilon'], ['theta']]])); + $this->assertCount(2, $this->invoke($apriori, 'frequent', [[['alpha'], ['beta']]])); + } + + public function testCandidates() + { + $apriori = new Apriori(); + $apriori->train($this->sampleGreek, []); + + $this->assertArraySubset([0 => ['alpha', 'beta']], $this->invoke($apriori, 'candidates', [[['alpha'], ['beta'], ['theta']]])); + $this->assertArraySubset([1 => ['alpha', 'theta']], $this->invoke($apriori, 'candidates', [[['alpha'], ['beta'], ['theta']]])); + $this->assertArraySubset([2 => ['beta', 'theta']], $this->invoke($apriori, 'candidates', [[['alpha'], ['beta'], ['theta']]])); + $this->assertCount(3, $this->invoke($apriori, 'candidates', [[['alpha'], ['beta'], ['theta']]])); + } + + public function testConfidence() + { + $apriori = new Apriori(); + $apriori->train($this->sampleGreek, []); + + $this->assertEquals(0.5, $this->invoke($apriori, 'confidence', [['alpha', 'beta', 'theta'], ['alpha', 'beta']])); + $this->assertEquals(1, $this->invoke($apriori, 'confidence', [['alpha', 'beta'], ['alpha']])); + } + + public function testSupport() + { + $apriori = new Apriori(); + $apriori->train($this->sampleGreek, []); + + $this->assertEquals(1.0, $this->invoke($apriori, 'support', [['alpha', 'beta']])); + $this->assertEquals(0.5, $this->invoke($apriori, 'support', [['epsilon']])); + } + + public function testFrequency() + { + $apriori = new Apriori(); + $apriori->train($this->sampleGreek, []); + + $this->assertEquals(4, $this->invoke($apriori, 'frequency', [['alpha', 'beta']])); + $this->assertEquals(2, $this->invoke($apriori, 'frequency', [['epsilon']])); + } + + public function testContains() + { + $apriori = new Apriori(); + + $this->assertTrue($this->invoke($apriori, 'contains', [[['a'], ['b']], ['a']])); + $this->assertTrue($this->invoke($apriori, 'contains', [[[1, 2]], [1, 
2]])); + $this->assertFalse($this->invoke($apriori, 'contains', [[['a'], ['b']], ['c']])); + } + + public function testSubset() + { + $apriori = new Apriori(); + + $this->assertTrue($this->invoke($apriori, 'subset', [['a', 'b'], ['a']])); + $this->assertTrue($this->invoke($apriori, 'subset', [['a'], ['a']])); + $this->assertFalse($this->invoke($apriori, 'subset', [['a'], ['a', 'b']])); + } + + public function testEquals() + { + $apriori = new Apriori(); + + $this->assertTrue($this->invoke($apriori, 'equals', [['a'], ['a']])); + $this->assertFalse($this->invoke($apriori, 'equals', [['a'], []])); + $this->assertFalse($this->invoke($apriori, 'equals', [['a'], ['b', 'a']])); + } + + /** + * Invokes objects method. Private/protected will be set accessible. + * + * @param object &$object Instantiated object to be called on + * @param string $method Method name to be called + * @param array $params Array of params to be passed + * + * @return mixed + */ + public function invoke(&$object, $method, array $params = array()) + { + $reflection = new \ReflectionClass(get_class($object)); + $method = $reflection->getMethod($method); + $method->setAccessible(true); + + return $method->invokeArgs($object, $params); + } +} From 90038befa9e3505ca794ed14291dc8dd1d615b21 Mon Sep 17 00:00:00 2001 From: Patrick Florek Date: Fri, 2 Sep 2016 00:18:50 +0200 Subject: [PATCH 2/2] Apply comments / coding styles * Remove user-specific gitignore * Add return type hints * Avoid global namespace in docs * Rename rules -> getRules * Split up rule generation Todo: * Move set theory out to math * Extract rule generation --- .gitignore | 1 - docs/machine-learning/association/apriori.md | 12 ++- src/Phpml/Association/Apriori.php | 106 +++++++++++-------- tests/Phpml/Association/AprioriTest.php | 4 +- 4 files changed, 73 insertions(+), 50 deletions(-) diff --git a/.gitignore b/.gitignore index e85e1fd..8a409f4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ -/.idea/ /vendor/ humbuglog.* 
/bin/phpunit diff --git a/docs/machine-learning/association/apriori.md b/docs/machine-learning/association/apriori.md index c5986f4..544406e 100644 --- a/docs/machine-learning/association/apriori.md +++ b/docs/machine-learning/association/apriori.md @@ -8,7 +8,9 @@ Association rule learning based on [Apriori algorithm](https://en.wikipedia.org/ * $confidence - [confidence](https://en.wikipedia.org/wiki/Association_rule_learning#Confidence), minimum relative amount of item set in frequent item sets ``` -$associator = new \Phpml\Association\Apriori($support = 0.5, $confidence = 0.5); +use Phpml\Association\Apriori; + +$associator = new Apriori($support = 0.5, $confidence = 0.5); ``` ### Train @@ -19,7 +21,9 @@ To train a associator simply provide train samples and labels (as `array`). Exam $samples = [['alpha', 'beta', 'epsilon'], ['alpha', 'beta', 'theta'], ['alpha', 'beta', 'epsilon'], ['alpha', 'beta', 'theta']]; $labels = []; -$associator = new \Phpml\Association\Apriori(0.5, 0.5); +use Phpml\Association\Apriori; + +$associator = new Apriori($support = 0.5, $confidence = 0.5); $associator->train($samples, $labels); ``` @@ -37,10 +41,10 @@ $associator->predict([['alpha','epsilon'],['beta','theta']]); ### Associating -Generating association rules simply use `rules` method. +To get the generated association rules, simply use the `getRules` method. ``` -$associator->rules(); +$associator->getRules(); // return [['antecedent' => ['alpha', 'theta'], 'consequent' => ['beta], 'support' => 1.0, 'confidence' => 1.0], ... ] ``` diff --git a/src/Phpml/Association/Apriori.php b/src/Phpml/Association/Apriori.php index bf52c27..4855691 100644 --- a/src/Phpml/Association/Apriori.php +++ b/src/Phpml/Association/Apriori.php @@ -1,6 +1,6 @@ support = $support; + $this->support = $support; $this->confidence = $confidence; } /** - * Generates apriori association rules. + * Get all association rules which are generated for every k-length frequent item set. 
* * @return mixed[][] */ - public function rules() + public function getRules() : array { if (!$this->large) { $this->large = $this->apriori(); @@ -76,33 +76,19 @@ class Apriori implements Associator $this->rules = []; - for ($k = 2; !empty($this->large[$k]); ++$k) { - foreach ($this->large[$k] as $frequent) { - foreach ($this->antecedents($frequent) as $antecedent) { - if ($this->confidence <= ($confidence = $this->confidence($frequent, $antecedent))) { - $consequent = array_values(array_diff($frequent, $antecedent)); - $this->rules[] = [ - self::ARRAY_KEY_ANTECEDENT => $antecedent, - self::ARRAY_KEY_CONSEQUENT => $consequent, - self::ARRAY_KEY_SUPPORT => $this->support($consequent), - self::ARRAY_KEY_CONFIDENCE => $confidence, - ]; - } - } - } - } + $this->generateAllRules(); return $this->rules; } /** - * Generates frequent item sets + * Generates frequent item sets. * * @return mixed[][][] */ - public function apriori() + public function apriori() : array { - $L = []; + $L = []; $L[1] = $this->items(); $L[1] = $this->frequent($L[1]); @@ -119,13 +105,47 @@ class Apriori implements Associator * * @return mixed[][] */ - protected function predictSample(array $sample) + protected function predictSample(array $sample) : array { - $predicts = array_values(array_filter($this->rules(), function($rule) use ($sample) { + $predicts = array_values(array_filter($this->getRules(), function ($rule) use ($sample) { return $this->equals($rule[self::ARRAY_KEY_ANTECEDENT], $sample); })); - return array_map(function($rule) { return $rule[self::ARRAY_KEY_CONSEQUENT]; }, $predicts); + return array_map(function ($rule) { + return $rule[self::ARRAY_KEY_CONSEQUENT]; + }, $predicts); + } + + /** + * Generate rules for each k-length frequent item set. 
+ */ + private function generateAllRules() + { + for ($k = 2; !empty($this->large[$k]); ++$k) { + foreach ($this->large[$k] as $frequent) { + $this->generateRules($frequent); + } + } + } + + /** + * Generate confident rules for frequent item set. + * + * @param mixed[] $frequent + */ + private function generateRules(array $frequent) + { + foreach ($this->antecedents($frequent) as $antecedent) { + if ($this->confidence <= ($confidence = $this->confidence($frequent, $antecedent))) { + $consequent = array_values(array_diff($frequent, $antecedent)); + $this->rules[] = [ + self::ARRAY_KEY_ANTECEDENT => $antecedent, + self::ARRAY_KEY_CONSEQUENT => $consequent, + self::ARRAY_KEY_SUPPORT => $this->support($consequent), + self::ARRAY_KEY_CONFIDENCE => $confidence, + ]; + } + } } /** @@ -135,7 +155,7 @@ class Apriori implements Associator * * @return mixed[][] */ - private function powerSet(array $sample) + private function powerSet(array $sample) : array { $results = [[]]; foreach ($sample as $item) { @@ -154,12 +174,12 @@ class Apriori implements Associator * * @return mixed[][] */ - private function antecedents(array $sample) + private function antecedents(array $sample) : array { $cardinality = count($sample); $antecedents = $this->powerSet($sample); - return array_filter($antecedents, function($antecedent) use ($cardinality) { + return array_filter($antecedents, function ($antecedent) use ($cardinality) { return (count($antecedent) != $cardinality) && ($antecedent != []); }); } @@ -169,7 +189,7 @@ class Apriori implements Associator * * @return mixed[][] */ - private function items() + private function items() : array { $items = []; @@ -181,7 +201,7 @@ class Apriori implements Associator } } - return array_map(function($entry) { + return array_map(function ($entry) { return [$entry]; }, $items); } @@ -193,9 +213,9 @@ class Apriori implements Associator * * @return mixed[][] */ - private function frequent(array $samples) + private function frequent(array $samples) : 
array { - return array_filter($samples, function($entry) { + return array_filter($samples, function ($entry) { return $this->support($entry) >= $this->support; }); } @@ -207,7 +227,7 @@ class Apriori implements Associator * * @return mixed[][] */ - private function candidates(array $samples) + private function candidates(array $samples) : array { $candidates = []; @@ -223,7 +243,7 @@ class Apriori implements Associator continue; } - foreach ((array)$this->samples as $sample) { + foreach ((array) $this->samples as $sample) { if ($this->subset($sample, $candidate)) { $candidates[] = $candidate; continue 2; @@ -244,7 +264,7 @@ class Apriori implements Associator * * @return float */ - private function confidence(array $set, array $subset) + private function confidence(array $set, array $subset) : float { return $this->support($set) / $this->support($subset); } @@ -259,7 +279,7 @@ class Apriori implements Associator * * @return float */ - private function support(array $sample) + private function support(array $sample) : float { return $this->frequency($sample) / count($this->samples); } @@ -273,9 +293,9 @@ class Apriori implements Associator * * @return int */ - private function frequency(array $sample) + private function frequency(array $sample) : int { - return count(array_filter($this->samples, function($entry) use ($sample) { + return count(array_filter($this->samples, function ($entry) use ($sample) { return $this->subset($entry, $sample); })); } @@ -290,9 +310,9 @@ class Apriori implements Associator * * @return bool */ - private function contains(array $system, array $set) + private function contains(array $system, array $set) : bool { - return (bool)array_filter($system, function($entry) use ($set) { + return (bool) array_filter($system, function ($entry) use ($set) { return $this->equals($entry, $set); }); } @@ -305,7 +325,7 @@ class Apriori implements Associator * * @return bool */ - private function subset(array $set, array $subset) + private function 
subset(array $set, array $subset) : bool { return !array_diff($subset, array_intersect($subset, $set)); } @@ -318,7 +338,7 @@ class Apriori implements Associator * * @return bool */ - private function equals(array $set1, array $set2) + private function equals(array $set1, array $set2) : bool { return array_diff($set1, $set2) == array_diff($set2, $set1); } diff --git a/tests/Phpml/Association/AprioriTest.php b/tests/Phpml/Association/AprioriTest.php index 9cc595d..b249ff6 100644 --- a/tests/Phpml/Association/AprioriTest.php +++ b/tests/Phpml/Association/AprioriTest.php @@ -71,12 +71,12 @@ class AprioriTest extends \PHPUnit_Framework_TestCase $this->assertTrue($this->invoke($apriori, 'contains', [$L[2], [3, 4]])); } - public function testRules() + public function testGetRules() { $apriori = new Apriori(0.4, 0.8); $apriori->train($this->sampleChars, []); - $this->assertCount(19, $apriori->rules()); + $this->assertCount(19, $apriori->getRules()); } public function testAntecedents()