From c8bd8db6019e0fdeb4c7907282e306fbd8675013 Mon Sep 17 00:00:00 2001 From: Patrick Florek Date: Tue, 23 Aug 2016 15:44:53 +0200 Subject: [PATCH] # Association rule learning - Apriori algorithm * Generating frequent k-length item sets * Generating rules based on frequent item sets * Algorithm has exponential complexity, be aware of it * Apriori algorithm is split into apriori and candidates method * Second step rule generation is implemented by rules method * Internal methods are invoked for fine grain unit tests * Wikipedia's train samples and an alternative are provided for test cases * Small documentation for public interface is also shipped --- .gitignore | 1 + docs/machine-learning/association/apriori.md | 54 +++ src/Phpml/Association/Apriori.php | 325 +++++++++++++++++++ src/Phpml/Association/Associator.php | 11 + tests/Phpml/Association/AprioriTest.php | 187 +++++++++++ 5 files changed, 578 insertions(+) create mode 100644 docs/machine-learning/association/apriori.md create mode 100644 src/Phpml/Association/Apriori.php create mode 100644 src/Phpml/Association/Associator.php create mode 100644 tests/Phpml/Association/AprioriTest.php diff --git a/.gitignore b/.gitignore index 8a409f4..e85e1fd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +/.idea/ /vendor/ humbuglog.* /bin/phpunit diff --git a/docs/machine-learning/association/apriori.md b/docs/machine-learning/association/apriori.md new file mode 100644 index 0000000..c5986f4 --- /dev/null +++ b/docs/machine-learning/association/apriori.md @@ -0,0 +1,54 @@ +# Apriori Associator + +Association rule learning based on [Apriori algorithm](https://en.wikipedia.org/wiki/Apriori_algorithm) for frequent item set mining. 
+ +### Constructor Parameters + +* $support - [support](https://en.wikipedia.org/wiki/Association_rule_learning#Support), minimum fraction of train samples that must contain an item set for it to count as frequent +* $confidence - [confidence](https://en.wikipedia.org/wiki/Association_rule_learning#Confidence), minimum confidence a rule must reach to be generated + +``` +$associator = new \Phpml\Association\Apriori($support = 0.5, $confidence = 0.5); +``` + +### Train + +To train an associator, simply provide train samples and labels (as `array`). Example: + +``` +$samples = [['alpha', 'beta', 'epsilon'], ['alpha', 'beta', 'theta'], ['alpha', 'beta', 'epsilon'], ['alpha', 'beta', 'theta']]; +$labels = []; + +$associator = new \Phpml\Association\Apriori(0.5, 0.5); +$associator->train($samples, $labels); +``` + +### Predict + +To predict sample labels, use the `predict` method. You can provide one sample or an array of samples: + +``` +$associator->predict(['alpha','theta']); +// return [[['beta']]] + +$associator->predict([['alpha','epsilon'],['beta','theta']]); +// return [[['beta']], [['alpha']]] +``` + +### Associating + +To generate association rules, simply use the `rules` method. + +``` +$associator->rules(); +// return [['antecedent' => ['alpha', 'theta'], 'consequent' => ['beta'], 'support' => 0.5, 'confidence' => 1.0], ... ] +``` + +### Frequent item sets + +To generate k-length frequent item sets, simply use the `apriori` method. + +``` +$associator->apriori(); +// return [ 1 => [['alpha'], ['beta'], ['theta'], ['epsilon']], 2 => [...], ...] +``` diff --git a/src/Phpml/Association/Apriori.php b/src/Phpml/Association/Apriori.php new file mode 100644 index 0000000..bf52c27 --- /dev/null +++ b/src/Phpml/Association/Apriori.php @@ -0,0 +1,325 @@ +support = $support; + $this->confidence = $confidence; + } + + /** + * Generates apriori association rules. 
+ * + * Frequent item sets are computed once via apriori() and kept in $this->large; a non-empty rule list is kept in + * $this->rules and returned unchanged on later calls. For every frequent item set of length >= 2, each proper + * non-empty subset is tried as antecedent and a rule is emitted when its confidence reaches $this->confidence. + * The reported support is the support of the whole frequent item set (antecedent and consequent together). + * + * @return mixed[][] List of rules, each keyed by the ARRAY_KEY_ANTECEDENT, ARRAY_KEY_CONSEQUENT, + * ARRAY_KEY_SUPPORT and ARRAY_KEY_CONFIDENCE class constants + */ + public function rules() + { + if (!$this->large) { + $this->large = $this->apriori(); + } + + if ($this->rules) { + return $this->rules; + } + + $this->rules = []; + + for ($k = 2; !empty($this->large[$k]); ++$k) { + foreach ($this->large[$k] as $frequent) { + foreach ($this->antecedents($frequent) as $antecedent) { + if ($this->confidence <= ($confidence = $this->confidence($frequent, $antecedent))) { + $consequent = array_values(array_diff($frequent, $antecedent)); + $this->rules[] = [ + self::ARRAY_KEY_ANTECEDENT => $antecedent, + self::ARRAY_KEY_CONSEQUENT => $consequent, + self::ARRAY_KEY_SUPPORT => $this->support($frequent), + self::ARRAY_KEY_CONFIDENCE => $confidence, + ]; + } + } + } + } + + return $this->rules; + } + + /** + * Generates frequent item sets + * + * The result is indexed by item set length k, starting at 1. The last entry is always an empty list, because + * the loop only stops after the previous level turned out to be empty. + * + * @return mixed[][][] + */ + public function apriori() + { + $L = []; + $L[1] = $this->items(); + $L[1] = $this->frequent($L[1]); + + for ($k = 2; !empty($L[$k - 1]); ++$k) { + $L[$k] = $this->candidates($L[$k - 1]); + $L[$k] = $this->frequent($L[$k]); + } + + return $L; + } + + /** + * Returns the consequents of all rules whose antecedent equals $sample. + * + * @param mixed[] $sample + * + * @return mixed[][] + */ + protected function predictSample(array $sample) + { + $predicts = array_values(array_filter($this->rules(), function($rule) use ($sample) { + return $this->equals($rule[self::ARRAY_KEY_ANTECEDENT], $sample); + })); + + return array_map(function($rule) { return $rule[self::ARRAY_KEY_CONSEQUENT]; }, $predicts); + } + + /** + * Generates the power set for given item set $sample, including the empty set. + * + * @param mixed[] $sample + * + * @return mixed[][] + */ + private function powerSet(array $sample) + { + $results = [[]]; + foreach ($sample as $item) { + foreach ($results as $combination) { + $results[] = array_merge([$item], $combination); + } + } + + return $results; + } + + /** + * Generates all proper subsets for given set $sample without the empty set. 
+ * + * The subsets are taken from powerSet(); the returned array keeps the power set's keys because array_filter() + * preserves keys. + * + * @param mixed[] $sample + * + * @return mixed[][] + */ + private function antecedents(array $sample) + { + $cardinality = count($sample); + $antecedents = $this->powerSet($sample); + + return array_filter($antecedents, function($antecedent) use ($cardinality) { + return (count($antecedent) != $cardinality) && ($antecedent != []); + }); + } + + /** + * Collects every distinct item of the train samples as a one-element item set (the k = 1 candidates). + * + * Items are compared strictly and listed in order of first appearance. No support filtering happens here. + * + * @return mixed[][] + */ + private function items() + { + $items = []; + + foreach ($this->samples as $sample) { + foreach ($sample as $item) { + if (!in_array($item, $items, true)) { + $items[] = $item; + } + } + } + + return array_map(function($entry) { + return [$entry]; + }, $items); + } + + /** + * Returns frequent item sets only. + * + * An item set is kept when its support is at least the configured minimum ($this->support). Keys of $samples + * are preserved. + * + * @param mixed[][] $samples + * + * @return mixed[][] + */ + private function frequent(array $samples) + { + return array_filter($samples, function($entry) { + return $this->support($entry) >= $this->support; + }); + } + + /** + * Generates candidate item sets by joining pairs of the given item sets. + * + * Two item sets are joined when their symmetric difference holds exactly two items. A joined set is kept only + * if it is not already among the candidates and at least one train sample contains it. + * + * @param mixed[][] $samples Item sets to join with each other + * + * @return mixed[][] + */ + private function candidates(array $samples) + { + $candidates = []; + + foreach ($samples as $p) { + foreach ($samples as $q) { + if (count(array_merge(array_diff($p, $q), array_diff($q, $p))) != 2) { + continue; + } + + $candidate = array_unique(array_merge($p, $q)); + + if ($this->contains($candidates, $candidate)) { + continue; + } + + foreach ((array)$this->samples as $sample) { + if ($this->subset($sample, $candidate)) { + $candidates[] = $candidate; + continue 2; + } + } + } + } + + return $candidates; + } + + /** + * Calculates confidence for $set. Confidence is the relative amount of sets containing $subset which also contain + * $set. 
+ * + * Computed as support($set) / support($subset), so $subset must occur in at least one sample. + * + * @param mixed[] $set + * @param mixed[] $subset + * + * @return float + */ + private function confidence(array $set, array $subset) + { + return $this->support($set) / $this->support($subset); + } + + /** + * Calculates support for item set $sample. Support is the relative amount of sets containing $sample in the data + * pool. + * + * The divisor is count($this->samples), so the data pool must not be empty. + * + * @see \Phpml\Association\Apriori::samples + * + * @param mixed[] $sample + * + * @return float + */ + private function support(array $sample) + { + return $this->frequency($sample) / count($this->samples); + } + + /** + * Counts occurrences of $sample as subset in data pool. + * + * @see \Phpml\Association\Apriori::samples + * + * @param mixed[] $sample + * + * @return int + */ + private function frequency(array $sample) + { + return count(array_filter($this->samples, function($entry) use ($sample) { + return $this->subset($entry, $sample); + })); + } + + /** + * Returns true if set is an element of system. + * + * Membership is decided with equals(), so item order inside the sets does not matter. + * + * @see \Phpml\Association\Apriori::equals() + * + * @param mixed[][] $system + * @param mixed[] $set + * + * @return bool + */ + private function contains(array $system, array $set) + { + return (bool)array_filter($system, function($entry) use ($set) { + return $this->equals($entry, $set); + }); + } + + /** + * Returns true if every item of $subset also occurs in $set; equal sets count as subsets too. + * + * Items are compared by their string representation, as array_diff() and array_intersect() do. + * + * @param mixed[] $set + * @param mixed[] $subset + * + * @return bool + */ + private function subset(array $set, array $subset) + { + return !array_diff($subset, array_intersect($subset, $set)); + } + + /** + * Returns true if string representation of items does not differ. 
+ * + * Compares array_diff($set1, $set2) with array_diff($set2, $set1) using loose equality (==). + * + * @param mixed[] $set1 + * @param mixed[] $set2 + * + * @return bool + */ + private function equals(array $set1, array $set2) + { + return array_diff($set1, $set2) == array_diff($set2, $set1); + } +} diff --git a/src/Phpml/Association/Associator.php b/src/Phpml/Association/Associator.php new file mode 100644 index 0000000..c339b5e --- /dev/null +++ b/src/Phpml/Association/Associator.php @@ -0,0 +1,11 @@ +train($this->sampleGreek, []); + + $this->assertEquals('beta', $apriori->predict([['alpha', 'epsilon'], ['beta', 'theta']])[0][0][0]); + $this->assertEquals('alpha', $apriori->predict([['alpha', 'epsilon'], ['beta', 'theta']])[1][0][0]); + } + + public function testPowerSet() + { + $apriori = new Apriori(); + + $this->assertCount(8, $this->invoke($apriori, 'powerSet', [['a', 'b', 'c']])); + } + + public function testApriori() + { + $apriori = new Apriori(3 / 7); + $apriori->train($this->sampleBasket, []); + + $L = $apriori->apriori(); + + $this->assertCount(0, $L[3]); + $this->assertCount(4, $L[2]); + $this->assertTrue($this->invoke($apriori, 'contains', [$L[2], [1, 2]])); + $this->assertFalse($this->invoke($apriori, 'contains', [$L[2], [1, 3]])); + $this->assertFalse($this->invoke($apriori, 'contains', [$L[2], [1, 4]])); + $this->assertTrue($this->invoke($apriori, 'contains', [$L[2], [2, 3]])); + $this->assertTrue($this->invoke($apriori, 'contains', [$L[2], [2, 4]])); + $this->assertTrue($this->invoke($apriori, 'contains', [$L[2], [3, 4]])); + } + + public function testRules() + { + $apriori = new Apriori(0.4, 0.8); + $apriori->train($this->sampleChars, []); + + $this->assertCount(19, $apriori->rules()); + } + + public function testAntecedents() + { + $apriori = new Apriori(); + + $this->assertCount(6, $this->invoke($apriori, 'antecedents', [['a', 'b', 'c']])); + } + + public function testItems() + { + $apriori = new Apriori(); + $apriori->train($this->sampleGreek, []); + $this->assertCount(4, $this->invoke($apriori, 'items', [])); + } + + public function 
testFrequent() + { + $apriori = new Apriori(0.51); + $apriori->train($this->sampleGreek, []); + + $this->assertCount(0, $this->invoke($apriori, 'frequent', [[['epsilon'], ['theta']]])); + $this->assertCount(2, $this->invoke($apriori, 'frequent', [[['alpha'], ['beta']]])); + } + + public function testCandidates() + { + $apriori = new Apriori(); + $apriori->train($this->sampleGreek, []); + + $this->assertArraySubset([0 => ['alpha', 'beta']], $this->invoke($apriori, 'candidates', [[['alpha'], ['beta'], ['theta']]])); + $this->assertArraySubset([1 => ['alpha', 'theta']], $this->invoke($apriori, 'candidates', [[['alpha'], ['beta'], ['theta']]])); + $this->assertArraySubset([2 => ['beta', 'theta']], $this->invoke($apriori, 'candidates', [[['alpha'], ['beta'], ['theta']]])); + $this->assertCount(3, $this->invoke($apriori, 'candidates', [[['alpha'], ['beta'], ['theta']]])); + } + + public function testConfidence() + { + $apriori = new Apriori(); + $apriori->train($this->sampleGreek, []); + + $this->assertEquals(0.5, $this->invoke($apriori, 'confidence', [['alpha', 'beta', 'theta'], ['alpha', 'beta']])); + $this->assertEquals(1, $this->invoke($apriori, 'confidence', [['alpha', 'beta'], ['alpha']])); + } + + public function testSupport() + { + $apriori = new Apriori(); + $apriori->train($this->sampleGreek, []); + + $this->assertEquals(1.0, $this->invoke($apriori, 'support', [['alpha', 'beta']])); + $this->assertEquals(0.5, $this->invoke($apriori, 'support', [['epsilon']])); + } + + public function testFrequency() + { + $apriori = new Apriori(); + $apriori->train($this->sampleGreek, []); + + $this->assertEquals(4, $this->invoke($apriori, 'frequency', [['alpha', 'beta']])); + $this->assertEquals(2, $this->invoke($apriori, 'frequency', [['epsilon']])); + } + + public function testContains() + { + $apriori = new Apriori(); + + $this->assertTrue($this->invoke($apriori, 'contains', [[['a'], ['b']], ['a']])); + $this->assertTrue($this->invoke($apriori, 'contains', [[[1, 2]], [1, 
2]])); + $this->assertFalse($this->invoke($apriori, 'contains', [[['a'], ['b']], ['c']])); + } + + public function testSubset() + { + $apriori = new Apriori(); + + $this->assertTrue($this->invoke($apriori, 'subset', [['a', 'b'], ['a']])); + $this->assertTrue($this->invoke($apriori, 'subset', [['a'], ['a']])); + $this->assertFalse($this->invoke($apriori, 'subset', [['a'], ['a', 'b']])); + } + + public function testEquals() + { + $apriori = new Apriori(); + + $this->assertTrue($this->invoke($apriori, 'equals', [['a'], ['a']])); + $this->assertFalse($this->invoke($apriori, 'equals', [['a'], []])); + $this->assertFalse($this->invoke($apriori, 'equals', [['a'], ['b', 'a']])); + } + + /** + * Invokes an object's method through reflection. Private/protected methods are made accessible before the call, + * which lets the tests exercise the internal helpers of the class under test directly. + * + * @param object &$object Instantiated object to be called on + * @param string $method Method name to be called + * @param array $params Array of params to be passed + * + * @return mixed Whatever the invoked method returns + */ + public function invoke(&$object, $method, array $params = array()) + { + $reflection = new \ReflectionClass(get_class($object)); + $method = $reflection->getMethod($method); + $method->setAccessible(true); + + return $method->invokeArgs($object, $params); + } +}