mirror of
https://github.com/Llewellynvdm/php-ml.git
synced 2024-11-24 22:07:33 +00:00
# Association rule learning - Apriori algorithm
* Generating frequent k-length item sets * Generating rules based on frequent item sets * Algorithm has exponential complexity, be aware of it * Apriori algorithm is split into apriori and candidates method * Second step rule generation is implemented by rules method * Internal methods are invoked for fine grain unit tests * Wikipedia's train samples and an alternative are provided for test cases * Small documentation for public interface is also shipped
This commit is contained in:
parent
6421a2ba41
commit
c8bd8db601
1
.gitignore
vendored
1
.gitignore
vendored
@ -1,3 +1,4 @@
|
||||
/.idea/
|
||||
/vendor/
|
||||
humbuglog.*
|
||||
/bin/phpunit
|
||||
|
54
docs/machine-learning/association/apriori.md
Normal file
54
docs/machine-learning/association/apriori.md
Normal file
@ -0,0 +1,54 @@
|
||||
# Apriori Associator
|
||||
|
||||
Association rule learning based on [Apriori algorithm](https://en.wikipedia.org/wiki/Apriori_algorithm) for frequent item set mining.
|
||||
|
||||
### Constructor Parameters
|
||||
|
||||
* $support - [support](https://en.wikipedia.org/wiki/Association_rule_learning#Support), minimum relative amount of frequent item set in train sample
|
||||
* $confidence - [confidence](https://en.wikipedia.org/wiki/Association_rule_learning#Confidence), minimum relative amount of item set in frequent item sets
|
||||
|
||||
```
|
||||
$associator = new \Phpml\Association\Apriori($support = 0.5, $confidence = 0.5);
|
||||
```
|
||||
|
||||
### Train
|
||||
|
||||
To train an associator, simply provide train samples and labels (as `array`). Example:
|
||||
|
||||
```
|
||||
$samples = [['alpha', 'beta', 'epsilon'], ['alpha', 'beta', 'theta'], ['alpha', 'beta', 'epsilon'], ['alpha', 'beta', 'theta']];
|
||||
$labels = [];
|
||||
|
||||
$associator = new \Phpml\Association\Apriori(0.5, 0.5);
|
||||
$associator->train($samples, $labels);
|
||||
```
|
||||
|
||||
### Predict
|
||||
|
||||
To predict a sample's label, use the `predict` method. You can provide one sample or an array of samples:
|
||||
|
||||
```
|
||||
$associator->predict(['alpha','theta']);
|
||||
// return [['beta']]
|
||||
|
||||
$associator->predict([['alpha','epsilon'],['beta','theta']]);
|
||||
// return [[['beta']], [['alpha']]]
|
||||
```
|
||||
|
||||
### Associating
|
||||
|
||||
To generate association rules, simply use the `rules` method.
|
||||
|
||||
```
|
||||
$associator->rules();
|
||||
// return [['antecedent' => ['alpha', 'theta'], 'consequent' => ['beta'], 'support' => 1.0, 'confidence' => 1.0], ... ]
|
||||
```
|
||||
|
||||
### Frequent item sets
|
||||
|
||||
To generate k-length frequent item sets, simply use the `apriori` method.
|
||||
|
||||
```
|
||||
$associator->apriori();
|
||||
// return [ 1 => [['alpha'], ['beta'], ['theta'], ['epsilon']], 2 => [...], ...]
|
||||
```
|
325
src/Phpml/Association/Apriori.php
Normal file
325
src/Phpml/Association/Apriori.php
Normal file
@ -0,0 +1,325 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types = 1);
|
||||
|
||||
namespace Phpml\Association;
|
||||
|
||||
use Phpml\Helper\Predictable;
|
||||
use Phpml\Helper\Trainable;
|
||||
|
||||
/**
 * Association rule learning with the Apriori algorithm.
 *
 * apriori() mines the frequent item sets of the training samples and rules() derives association rules from
 * them. Rule generation enumerates the power set of every frequent item set, so the running time grows
 * exponentially with the length of the item sets.
 */
class Apriori implements Associator
{
    use Trainable, Predictable;

    const ARRAY_KEY_ANTECEDENT = 'antecedent';

    const ARRAY_KEY_CONFIDENCE = 'confidence';

    const ARRAY_KEY_CONSEQUENT = 'consequent';

    const ARRAY_KEY_SUPPORT = 'support';

    /**
     * Minimum confidence a rule must reach to be reported by rules().
     *
     * @var float
     */
    private $confidence;

    /**
     * Frequent item sets as returned by apriori(), indexed by item set length k.
     *
     * @var mixed[][][]
     */
    private $large = [];

    /**
     * Minimum support an item set must reach to be kept by frequent().
     *
     * @var float
     */
    private $support;

    /**
     * The generated Apriori association rules.
     *
     * @var mixed[][]
     */
    private $rules = [];

    /**
     * Apriori constructor.
     *
     * @param float $support    minimum support of a frequent item set
     * @param float $confidence minimum confidence of a generated rule
     */
    public function __construct($support = 0.0, $confidence = 0.0)
    {
        $this->support = $support;
        $this->confidence = $confidence;
    }

    /**
     * Generates apriori association rules.
     *
     * Every frequent item set of length >= 2 is split into each possible antecedent/consequent pair; a pair is
     * reported when its confidence reaches the configured minimum. Each rule is an array keyed by the
     * ARRAY_KEY_* constants.
     *
     * NOTE(review): the frequent-set and rule caches are never reset inside this class; if the model can be
     * trained again, confirm the cached results are still wanted.
     *
     * @return mixed[][]
     */
    public function rules()
    {
        if (!$this->large) {
            $this->large = $this->apriori();
        }

        if ($this->rules) {
            return $this->rules;
        }

        $this->rules = [];

        for ($k = 2; !empty($this->large[$k]); ++$k) {
            foreach ($this->large[$k] as $frequent) {
                // The support of a rule X => Y is the support of X ∪ Y, i.e. of the whole frequent item set
                // (not of the consequent alone). It does not depend on how the set is split.
                $support = $this->support($frequent);

                foreach ($this->antecedents($frequent) as $antecedent) {
                    $confidence = $this->confidence($frequent, $antecedent);

                    if ($this->confidence > $confidence) {
                        continue;
                    }

                    $this->rules[] = [
                        self::ARRAY_KEY_ANTECEDENT => $antecedent,
                        self::ARRAY_KEY_CONSEQUENT => array_values(array_diff($frequent, $antecedent)),
                        self::ARRAY_KEY_SUPPORT => $support,
                        self::ARRAY_KEY_CONFIDENCE => $confidence,
                    ];
                }
            }
        }

        return $this->rules;
    }

    /**
     * Generates frequent item sets.
     *
     * The result is indexed by item set length, starting at 1. Generation stops after the first length that
     * yields no frequent item set, so the last entry is always an empty array.
     *
     * @return mixed[][][]
     */
    public function apriori()
    {
        $L = [];
        $L[1] = $this->frequent($this->items());

        for ($k = 2; !empty($L[$k - 1]); ++$k) {
            $L[$k] = $this->frequent($this->candidates($L[$k - 1]));
        }

        return $L;
    }

    /**
     * Returns the consequents of all rules whose antecedent equals $sample.
     *
     * @param mixed[] $sample
     *
     * @return mixed[][]
     */
    protected function predictSample(array $sample)
    {
        $matching = array_values(array_filter($this->rules(), function ($rule) use ($sample) {
            return $this->equals($rule[self::ARRAY_KEY_ANTECEDENT], $sample);
        }));

        return array_map(function ($rule) {
            return $rule[self::ARRAY_KEY_CONSEQUENT];
        }, $matching);
    }

    /**
     * Generates the power set for given item set $sample.
     *
     * @param mixed[] $sample
     *
     * @return mixed[][]
     */
    private function powerSet(array $sample): array
    {
        $results = [[]];

        foreach ($sample as $item) {
            // foreach iterates over a copy, so only the combinations that existed before this item are
            // extended; the result doubles with every item.
            foreach ($results as $combination) {
                $results[] = array_merge([$item], $combination);
            }
        }

        return $results;
    }

    /**
     * Generates all proper subsets for given set $sample without the empty set.
     *
     * The keys of the returned array are those of the power set and are therefore not contiguous.
     *
     * @param mixed[] $sample
     *
     * @return mixed[][]
     */
    private function antecedents(array $sample): array
    {
        $cardinality = count($sample);

        return array_filter($this->powerSet($sample), function (array $antecedent) use ($cardinality) {
            return $antecedent !== [] && count($antecedent) !== $cardinality;
        });
    }

    /**
     * Collects every distinct item of the training samples as a one-element item set.
     *
     * No frequency filtering happens here; see frequent().
     *
     * @return mixed[][]
     */
    private function items(): array
    {
        $items = [];

        foreach ($this->samples as $sample) {
            foreach ($sample as $item) {
                if (!in_array($item, $items, true)) {
                    $items[] = $item;
                }
            }
        }

        return array_map(function ($entry) {
            return [$entry];
        }, $items);
    }

    /**
     * Returns frequent item sets only, i.e. those whose support reaches the configured minimum.
     *
     * Keys of $samples are preserved.
     *
     * @param mixed[][] $samples
     *
     * @return mixed[][]
     */
    private function frequent(array $samples): array
    {
        return array_filter($samples, function ($entry) {
            return $this->support($entry) >= $this->support;
        });
    }

    /**
     * Generates candidate item sets by joining pairs of the given item sets.
     *
     * Two item sets are joined when their symmetric difference has exactly two items. A candidate is kept
     * once, and only if at least one training sample contains it.
     *
     * @param mixed[][] $samples
     *
     * @return mixed[][]
     */
    private function candidates(array $samples): array
    {
        $candidates = [];

        foreach ($samples as $p) {
            foreach ($samples as $q) {
                if (count(array_merge(array_diff($p, $q), array_diff($q, $p))) !== 2) {
                    continue;
                }

                $candidate = array_unique(array_merge($p, $q));

                if ($this->contains($candidates, $candidate)) {
                    continue;
                }

                foreach ((array) $this->samples as $sample) {
                    if ($this->subset($sample, $candidate)) {
                        $candidates[] = $candidate;
                        // One containing sample is enough; move on to the next pair.
                        break;
                    }
                }
            }
        }

        return $candidates;
    }

    /**
     * Calculates confidence for $set. Confidence is the relative amount of sets containing $subset which also
     * contain $set.
     *
     * @param mixed[] $set
     * @param mixed[] $subset
     *
     * @return float
     */
    private function confidence(array $set, array $subset): float
    {
        return $this->support($set) / $this->support($subset);
    }

    /**
     * Calculates support for item set $sample. Support is the relative amount of sets containing $sample in
     * the data pool.
     *
     * @see \Phpml\Association\Apriori::samples
     *
     * @param mixed[] $sample
     *
     * @return float
     */
    private function support(array $sample): float
    {
        return $this->frequency($sample) / count($this->samples);
    }

    /**
     * Counts occurrences of $sample as subset in data pool.
     *
     * @see \Phpml\Association\Apriori::samples
     *
     * @param mixed[] $sample
     *
     * @return int
     */
    private function frequency(array $sample): int
    {
        return count(array_filter($this->samples, function ($entry) use ($sample) {
            return $this->subset($entry, $sample);
        }));
    }

    /**
     * Returns true if set is an element of system.
     *
     * @see \Phpml\Association\Apriori::equals()
     *
     * @param mixed[][] $system
     * @param mixed[]   $set
     *
     * @return bool
     */
    private function contains(array $system, array $set): bool
    {
        foreach ($system as $entry) {
            if ($this->equals($entry, $set)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Returns true if every item of $subset also occurs in $set; equal sets count as subsets.
     *
     * Items are compared by their string representation (array_diff semantics).
     *
     * @param mixed[] $set
     * @param mixed[] $subset
     *
     * @return bool
     */
    private function subset(array $set, array $subset): bool
    {
        return !array_diff($subset, $set);
    }

    /**
     * Returns true if both sets hold the same items, compared by string representation.
     *
     * Each difference must be empty. Comparing the two difference arrays with == instead would treat sets
     * such as ['01'] and ['1'] as equal, because their leftover items are loosely equal numeric strings.
     *
     * @param mixed[] $set1
     * @param mixed[] $set2
     *
     * @return bool
     */
    private function equals(array $set1, array $set2): bool
    {
        return !array_diff($set1, $set2) && !array_diff($set2, $set1);
    }
}
|
11
src/Phpml/Association/Associator.php
Normal file
11
src/Phpml/Association/Associator.php
Normal file
@ -0,0 +1,11 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Phpml\Association;
|
||||
|
||||
use Phpml\Estimator;
|
||||
|
||||
/**
 * Contract for association rule learners such as Apriori.
 *
 * Declares no members of its own; everything is inherited from Estimator.
 */
interface Associator extends Estimator
{
}
|
187
tests/Phpml/Association/AprioriTest.php
Normal file
187
tests/Phpml/Association/AprioriTest.php
Normal file
@ -0,0 +1,187 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types = 1);
|
||||
|
||||
namespace tests\Classification;
|
||||
|
||||
use Phpml\Association\Apriori;
|
||||
|
||||
/**
 * Unit tests for the Apriori associator.
 *
 * Non-public methods of Apriori are exercised through invoke().
 */
class AprioriTest extends \PHPUnit_Framework_TestCase
{
    private $sampleGreek = [
        ['alpha', 'beta', 'epsilon'],
        ['alpha', 'beta', 'theta'],
        ['alpha', 'beta', 'epsilon'],
        ['alpha', 'beta', 'theta'],
    ];

    private $sampleChars = [
        ['E', 'D', 'N', 'E+N', 'EN'],
        ['E', 'R', 'N', 'E+R', 'E+N', 'ER', 'EN'],
        ['D', 'R'],
        ['E', 'D', 'N', 'E+N'],
        ['E', 'R', 'N', 'E+R', 'E+N', 'ER'],
        ['E', 'D', 'R', 'E+R', 'ER'],
        ['E', 'D', 'N', 'E+N', 'EN'],
        ['E', 'R', 'E+R'],
        ['E'],
        ['N'],
    ];

    private $sampleBasket = [
        [1, 2, 3, 4],
        [1, 2, 4],
        [1, 2],
        [2, 3, 4],
        [2, 3],
        [3, 4],
        [2, 4],
    ];

    public function testGreek()
    {
        $associator = $this->trained($this->sampleGreek, 0.5, 0.5);

        $predicted = $associator->predict([['alpha', 'epsilon'], ['beta', 'theta']]);

        $this->assertEquals('beta', $predicted[0][0][0]);
        $this->assertEquals('alpha', $predicted[1][0][0]);
    }

    public function testPowerSet()
    {
        $associator = new Apriori();

        $powerSet = $this->invoke($associator, 'powerSet', [['a', 'b', 'c']]);

        $this->assertCount(8, $powerSet);
    }

    public function testApriori()
    {
        $associator = $this->trained($this->sampleBasket, 3 / 7);

        $frequent = $associator->apriori();

        $this->assertCount(0, $frequent[3]);
        $this->assertCount(4, $frequent[2]);

        // Pairs that reach the minimum support of 3/7 ...
        foreach ([[1, 2], [2, 3], [2, 4], [3, 4]] as $pair) {
            $this->assertTrue($this->invoke($associator, 'contains', [$frequent[2], $pair]));
        }

        // ... and pairs that do not.
        foreach ([[1, 3], [1, 4]] as $pair) {
            $this->assertFalse($this->invoke($associator, 'contains', [$frequent[2], $pair]));
        }
    }

    public function testRules()
    {
        $associator = $this->trained($this->sampleChars, 0.4, 0.8);

        $this->assertCount(19, $associator->rules());
    }

    public function testAntecedents()
    {
        $associator = new Apriori();

        $antecedents = $this->invoke($associator, 'antecedents', [['a', 'b', 'c']]);

        $this->assertCount(6, $antecedents);
    }

    public function testItems()
    {
        $associator = $this->trained($this->sampleGreek);

        $this->assertCount(4, $this->invoke($associator, 'items', []));
    }

    public function testFrequent()
    {
        $associator = $this->trained($this->sampleGreek, 0.51);

        $rare = $this->invoke($associator, 'frequent', [[['epsilon'], ['theta']]]);
        $common = $this->invoke($associator, 'frequent', [[['alpha'], ['beta']]]);

        $this->assertCount(0, $rare);
        $this->assertCount(2, $common);
    }

    public function testCandidates()
    {
        $associator = $this->trained($this->sampleGreek);

        $candidates = $this->invoke($associator, 'candidates', [[['alpha'], ['beta'], ['theta']]]);

        $this->assertArraySubset([0 => ['alpha', 'beta']], $candidates);
        $this->assertArraySubset([1 => ['alpha', 'theta']], $candidates);
        $this->assertArraySubset([2 => ['beta', 'theta']], $candidates);
        $this->assertCount(3, $candidates);
    }

    public function testConfidence()
    {
        $associator = $this->trained($this->sampleGreek);

        $partial = $this->invoke($associator, 'confidence', [['alpha', 'beta', 'theta'], ['alpha', 'beta']]);
        $certain = $this->invoke($associator, 'confidence', [['alpha', 'beta'], ['alpha']]);

        $this->assertEquals(0.5, $partial);
        $this->assertEquals(1, $certain);
    }

    public function testSupport()
    {
        $associator = $this->trained($this->sampleGreek);

        $this->assertEquals(1.0, $this->invoke($associator, 'support', [['alpha', 'beta']]));
        $this->assertEquals(0.5, $this->invoke($associator, 'support', [['epsilon']]));
    }

    public function testFrequency()
    {
        $associator = $this->trained($this->sampleGreek);

        $this->assertEquals(4, $this->invoke($associator, 'frequency', [['alpha', 'beta']]));
        $this->assertEquals(2, $this->invoke($associator, 'frequency', [['epsilon']]));
    }

    public function testContains()
    {
        $associator = new Apriori();

        $this->assertTrue($this->invoke($associator, 'contains', [[['a'], ['b']], ['a']]));
        $this->assertTrue($this->invoke($associator, 'contains', [[[1, 2]], [1, 2]]));
        $this->assertFalse($this->invoke($associator, 'contains', [[['a'], ['b']], ['c']]));
    }

    public function testSubset()
    {
        $associator = new Apriori();

        $this->assertTrue($this->invoke($associator, 'subset', [['a', 'b'], ['a']]));
        $this->assertTrue($this->invoke($associator, 'subset', [['a'], ['a']]));
        $this->assertFalse($this->invoke($associator, 'subset', [['a'], ['a', 'b']]));
    }

    public function testEquals()
    {
        $associator = new Apriori();

        $this->assertTrue($this->invoke($associator, 'equals', [['a'], ['a']]));
        $this->assertFalse($this->invoke($associator, 'equals', [['a'], []]));
        $this->assertFalse($this->invoke($associator, 'equals', [['a'], ['b', 'a']]));
    }

    /**
     * Invokes objects method. Private/protected will be set accessible.
     *
     * @param object &$object Instantiated object to be called on
     * @param string $method  Method name to be called
     * @param array  $params  Array of params to be passed
     *
     * @return mixed
     */
    public function invoke(&$object, $method, array $params = [])
    {
        $target = new \ReflectionMethod($object, $method);
        $target->setAccessible(true);

        return $target->invokeArgs($object, $params);
    }

    /**
     * Builds an Apriori instance with the given thresholds and trains it on $samples with no labels.
     *
     * @param mixed[][] $samples
     * @param float     $support
     * @param float     $confidence
     *
     * @return Apriori
     */
    private function trained(array $samples, $support = 0.0, $confidence = 0.0)
    {
        $associator = new Apriori($support, $confidence);
        $associator->train($samples, []);

        return $associator;
    }
}
|
Loading…
Reference in New Issue
Block a user