From c104e602010f516cd81948817130a68138265b14 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sat, 9 Apr 2016 00:53:18 +0200 Subject: [PATCH 01/59] fix docs link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index feac3e5..e2fc4b3 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Fresh approach to machine learning in PHP. Note that at the moment PHP is not th ## Documentation -To find out how to use PHP-ML follow [Documentation](php-ml.readthedocs.org). +To find out how to use PHP-ML follow [Documentation](http://php-ml.readthedocs.org/). ## Installation From dd5358130995fb7513f98024ceaecc8c1a1ad8c0 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sat, 9 Apr 2016 15:33:05 +0200 Subject: [PATCH 02/59] wine class dataset --- data/wine.csv | 179 ++++++++++++++++++ src/Phpml/Dataset/Demo/Wine.php | 22 +++ src/Phpml/Metric/Accuracy.php | 2 +- .../Classifier/KNearestNeighborsTest.php | 13 ++ tests/Phpml/Dataset/Demo/WineTest.php | 22 +++ 5 files changed, 237 insertions(+), 1 deletion(-) create mode 100644 data/wine.csv create mode 100644 src/Phpml/Dataset/Demo/Wine.php create mode 100644 tests/Phpml/Dataset/Demo/WineTest.php diff --git a/data/wine.csv b/data/wine.csv new file mode 100644 index 0000000..fe00ec1 --- /dev/null +++ b/data/wine.csv @@ -0,0 +1,179 @@ +alcohol,malic acid,ash,alcalinity of ash,magnesium,total phenols,flavanoids,nonflavanoid phenols,proanthocyanins,color intensity,hue,OD280/OD315 of diluted wines,proline,class +14.23,1.71,2.43,15.6,127,2.8,3.06,.28,2.29,5.64,1.04,3.92,1065,1 +13.2,1.78,2.14,11.2,100,2.65,2.76,.26,1.28,4.38,1.05,3.4,1050,1 +13.16,2.36,2.67,18.6,101,2.8,3.24,.3,2.81,5.68,1.03,3.17,1185,1 +14.37,1.95,2.5,16.8,113,3.85,3.49,.24,2.18,7.8,.86,3.45,1480,1 +13.24,2.59,2.87,21,118,2.8,2.69,.39,1.82,4.32,1.04,2.93,735,1 +14.2,1.76,2.45,15.2,112,3.27,3.39,.34,1.97,6.75,1.05,2.85,1450,1 +14.39,1.87,2.45,14.6,96,2.5,2.52,.3,1.98,5.25,1.02,3.58,1290,1 
+14.06,2.15,2.61,17.6,121,2.6,2.51,.31,1.25,5.05,1.06,3.58,1295,1 +14.83,1.64,2.17,14,97,2.8,2.98,.29,1.98,5.2,1.08,2.85,1045,1 +13.86,1.35,2.27,16,98,2.98,3.15,.22,1.85,7.22,1.01,3.55,1045,1 +14.1,2.16,2.3,18,105,2.95,3.32,.22,2.38,5.75,1.25,3.17,1510,1 +14.12,1.48,2.32,16.8,95,2.2,2.43,.26,1.57,5,1.17,2.82,1280,1 +13.75,1.73,2.41,16,89,2.6,2.76,.29,1.81,5.6,1.15,2.9,1320,1 +14.75,1.73,2.39,11.4,91,3.1,3.69,.43,2.81,5.4,1.25,2.73,1150,1 +14.38,1.87,2.38,12,102,3.3,3.64,.29,2.96,7.5,1.2,3,1547,1 +13.63,1.81,2.7,17.2,112,2.85,2.91,.3,1.46,7.3,1.28,2.88,1310,1 +14.3,1.92,2.72,20,120,2.8,3.14,.33,1.97,6.2,1.07,2.65,1280,1 +13.83,1.57,2.62,20,115,2.95,3.4,.4,1.72,6.6,1.13,2.57,1130,1 +14.19,1.59,2.48,16.5,108,3.3,3.93,.32,1.86,8.7,1.23,2.82,1680,1 +13.64,3.1,2.56,15.2,116,2.7,3.03,.17,1.66,5.1,.96,3.36,845,1 +14.06,1.63,2.28,16,126,3,3.17,.24,2.1,5.65,1.09,3.71,780,1 +12.93,3.8,2.65,18.6,102,2.41,2.41,.25,1.98,4.5,1.03,3.52,770,1 +13.71,1.86,2.36,16.6,101,2.61,2.88,.27,1.69,3.8,1.11,4,1035,1 +12.85,1.6,2.52,17.8,95,2.48,2.37,.26,1.46,3.93,1.09,3.63,1015,1 +13.5,1.81,2.61,20,96,2.53,2.61,.28,1.66,3.52,1.12,3.82,845,1 +13.05,2.05,3.22,25,124,2.63,2.68,.47,1.92,3.58,1.13,3.2,830,1 +13.39,1.77,2.62,16.1,93,2.85,2.94,.34,1.45,4.8,.92,3.22,1195,1 +13.3,1.72,2.14,17,94,2.4,2.19,.27,1.35,3.95,1.02,2.77,1285,1 +13.87,1.9,2.8,19.4,107,2.95,2.97,.37,1.76,4.5,1.25,3.4,915,1 +14.02,1.68,2.21,16,96,2.65,2.33,.26,1.98,4.7,1.04,3.59,1035,1 +13.73,1.5,2.7,22.5,101,3,3.25,.29,2.38,5.7,1.19,2.71,1285,1 +13.58,1.66,2.36,19.1,106,2.86,3.19,.22,1.95,6.9,1.09,2.88,1515,1 +13.68,1.83,2.36,17.2,104,2.42,2.69,.42,1.97,3.84,1.23,2.87,990,1 +13.76,1.53,2.7,19.5,132,2.95,2.74,.5,1.35,5.4,1.25,3,1235,1 +13.51,1.8,2.65,19,110,2.35,2.53,.29,1.54,4.2,1.1,2.87,1095,1 +13.48,1.81,2.41,20.5,100,2.7,2.98,.26,1.86,5.1,1.04,3.47,920,1 +13.28,1.64,2.84,15.5,110,2.6,2.68,.34,1.36,4.6,1.09,2.78,880,1 +13.05,1.65,2.55,18,98,2.45,2.43,.29,1.44,4.25,1.12,2.51,1105,1 
+13.07,1.5,2.1,15.5,98,2.4,2.64,.28,1.37,3.7,1.18,2.69,1020,1 +14.22,3.99,2.51,13.2,128,3,3.04,.2,2.08,5.1,.89,3.53,760,1 +13.56,1.71,2.31,16.2,117,3.15,3.29,.34,2.34,6.13,.95,3.38,795,1 +13.41,3.84,2.12,18.8,90,2.45,2.68,.27,1.48,4.28,.91,3,1035,1 +13.88,1.89,2.59,15,101,3.25,3.56,.17,1.7,5.43,.88,3.56,1095,1 +13.24,3.98,2.29,17.5,103,2.64,2.63,.32,1.66,4.36,.82,3,680,1 +13.05,1.77,2.1,17,107,3,3,.28,2.03,5.04,.88,3.35,885,1 +14.21,4.04,2.44,18.9,111,2.85,2.65,.3,1.25,5.24,.87,3.33,1080,1 +14.38,3.59,2.28,16,102,3.25,3.17,.27,2.19,4.9,1.04,3.44,1065,1 +13.9,1.68,2.12,16,101,3.1,3.39,.21,2.14,6.1,.91,3.33,985,1 +14.1,2.02,2.4,18.8,103,2.75,2.92,.32,2.38,6.2,1.07,2.75,1060,1 +13.94,1.73,2.27,17.4,108,2.88,3.54,.32,2.08,8.90,1.12,3.1,1260,1 +13.05,1.73,2.04,12.4,92,2.72,3.27,.17,2.91,7.2,1.12,2.91,1150,1 +13.83,1.65,2.6,17.2,94,2.45,2.99,.22,2.29,5.6,1.24,3.37,1265,1 +13.82,1.75,2.42,14,111,3.88,3.74,.32,1.87,7.05,1.01,3.26,1190,1 +13.77,1.9,2.68,17.1,115,3,2.79,.39,1.68,6.3,1.13,2.93,1375,1 +13.74,1.67,2.25,16.4,118,2.6,2.9,.21,1.62,5.85,.92,3.2,1060,1 +13.56,1.73,2.46,20.5,116,2.96,2.78,.2,2.45,6.25,.98,3.03,1120,1 +14.22,1.7,2.3,16.3,118,3.2,3,.26,2.03,6.38,.94,3.31,970,1 +13.29,1.97,2.68,16.8,102,3,3.23,.31,1.66,6,1.07,2.84,1270,1 +13.72,1.43,2.5,16.7,108,3.4,3.67,.19,2.04,6.8,.89,2.87,1285,1 +12.37,.94,1.36,10.6,88,1.98,.57,.28,.42,1.95,1.05,1.82,520,2 +12.33,1.1,2.28,16,101,2.05,1.09,.63,.41,3.27,1.25,1.67,680,2 +12.64,1.36,2.02,16.8,100,2.02,1.41,.53,.62,5.75,.98,1.59,450,2 +13.67,1.25,1.92,18,94,2.1,1.79,.32,.73,3.8,1.23,2.46,630,2 +12.37,1.13,2.16,19,87,3.5,3.1,.19,1.87,4.45,1.22,2.87,420,2 +12.17,1.45,2.53,19,104,1.89,1.75,.45,1.03,2.95,1.45,2.23,355,2 +12.37,1.21,2.56,18.1,98,2.42,2.65,.37,2.08,4.6,1.19,2.3,678,2 +13.11,1.01,1.7,15,78,2.98,3.18,.26,2.28,5.3,1.12,3.18,502,2 +12.37,1.17,1.92,19.6,78,2.11,2,.27,1.04,4.68,1.12,3.48,510,2 +13.34,.94,2.36,17,110,2.53,1.3,.55,.42,3.17,1.02,1.93,750,2 
+12.21,1.19,1.75,16.8,151,1.85,1.28,.14,2.5,2.85,1.28,3.07,718,2 +12.29,1.61,2.21,20.4,103,1.1,1.02,.37,1.46,3.05,.906,1.82,870,2 +13.86,1.51,2.67,25,86,2.95,2.86,.21,1.87,3.38,1.36,3.16,410,2 +13.49,1.66,2.24,24,87,1.88,1.84,.27,1.03,3.74,.98,2.78,472,2 +12.99,1.67,2.6,30,139,3.3,2.89,.21,1.96,3.35,1.31,3.5,985,2 +11.96,1.09,2.3,21,101,3.38,2.14,.13,1.65,3.21,.99,3.13,886,2 +11.66,1.88,1.92,16,97,1.61,1.57,.34,1.15,3.8,1.23,2.14,428,2 +13.03,.9,1.71,16,86,1.95,2.03,.24,1.46,4.6,1.19,2.48,392,2 +11.84,2.89,2.23,18,112,1.72,1.32,.43,.95,2.65,.96,2.52,500,2 +12.33,.99,1.95,14.8,136,1.9,1.85,.35,2.76,3.4,1.06,2.31,750,2 +12.7,3.87,2.4,23,101,2.83,2.55,.43,1.95,2.57,1.19,3.13,463,2 +12,.92,2,19,86,2.42,2.26,.3,1.43,2.5,1.38,3.12,278,2 +12.72,1.81,2.2,18.8,86,2.2,2.53,.26,1.77,3.9,1.16,3.14,714,2 +12.08,1.13,2.51,24,78,2,1.58,.4,1.4,2.2,1.31,2.72,630,2 +13.05,3.86,2.32,22.5,85,1.65,1.59,.61,1.62,4.8,.84,2.01,515,2 +11.84,.89,2.58,18,94,2.2,2.21,.22,2.35,3.05,.79,3.08,520,2 +12.67,.98,2.24,18,99,2.2,1.94,.3,1.46,2.62,1.23,3.16,450,2 +12.16,1.61,2.31,22.8,90,1.78,1.69,.43,1.56,2.45,1.33,2.26,495,2 +11.65,1.67,2.62,26,88,1.92,1.61,.4,1.34,2.6,1.36,3.21,562,2 +11.64,2.06,2.46,21.6,84,1.95,1.69,.48,1.35,2.8,1,2.75,680,2 +12.08,1.33,2.3,23.6,70,2.2,1.59,.42,1.38,1.74,1.07,3.21,625,2 +12.08,1.83,2.32,18.5,81,1.6,1.5,.52,1.64,2.4,1.08,2.27,480,2 +12,1.51,2.42,22,86,1.45,1.25,.5,1.63,3.6,1.05,2.65,450,2 +12.69,1.53,2.26,20.7,80,1.38,1.46,.58,1.62,3.05,.96,2.06,495,2 +12.29,2.83,2.22,18,88,2.45,2.25,.25,1.99,2.15,1.15,3.3,290,2 +11.62,1.99,2.28,18,98,3.02,2.26,.17,1.35,3.25,1.16,2.96,345,2 +12.47,1.52,2.2,19,162,2.5,2.27,.32,3.28,2.6,1.16,2.63,937,2 +11.81,2.12,2.74,21.5,134,1.6,.99,.14,1.56,2.5,.95,2.26,625,2 +12.29,1.41,1.98,16,85,2.55,2.5,.29,1.77,2.9,1.23,2.74,428,2 +12.37,1.07,2.1,18.5,88,3.52,3.75,.24,1.95,4.5,1.04,2.77,660,2 +12.29,3.17,2.21,18,88,2.85,2.99,.45,2.81,2.3,1.42,2.83,406,2 +12.08,2.08,1.7,17.5,97,2.23,2.17,.26,1.4,3.3,1.27,2.96,710,2 
+12.6,1.34,1.9,18.5,88,1.45,1.36,.29,1.35,2.45,1.04,2.77,562,2 +12.34,2.45,2.46,21,98,2.56,2.11,.34,1.31,2.8,.8,3.38,438,2 +11.82,1.72,1.88,19.5,86,2.5,1.64,.37,1.42,2.06,.94,2.44,415,2 +12.51,1.73,1.98,20.5,85,2.2,1.92,.32,1.48,2.94,1.04,3.57,672,2 +12.42,2.55,2.27,22,90,1.68,1.84,.66,1.42,2.7,.86,3.3,315,2 +12.25,1.73,2.12,19,80,1.65,2.03,.37,1.63,3.4,1,3.17,510,2 +12.72,1.75,2.28,22.5,84,1.38,1.76,.48,1.63,3.3,.88,2.42,488,2 +12.22,1.29,1.94,19,92,2.36,2.04,.39,2.08,2.7,.86,3.02,312,2 +11.61,1.35,2.7,20,94,2.74,2.92,.29,2.49,2.65,.96,3.26,680,2 +11.46,3.74,1.82,19.5,107,3.18,2.58,.24,3.58,2.9,.75,2.81,562,2 +12.52,2.43,2.17,21,88,2.55,2.27,.26,1.22,2,.9,2.78,325,2 +11.76,2.68,2.92,20,103,1.75,2.03,.6,1.05,3.8,1.23,2.5,607,2 +11.41,.74,2.5,21,88,2.48,2.01,.42,1.44,3.08,1.1,2.31,434,2 +12.08,1.39,2.5,22.5,84,2.56,2.29,.43,1.04,2.9,.93,3.19,385,2 +11.03,1.51,2.2,21.5,85,2.46,2.17,.52,2.01,1.9,1.71,2.87,407,2 +11.82,1.47,1.99,20.8,86,1.98,1.6,.3,1.53,1.95,.95,3.33,495,2 +12.42,1.61,2.19,22.5,108,2,2.09,.34,1.61,2.06,1.06,2.96,345,2 +12.77,3.43,1.98,16,80,1.63,1.25,.43,.83,3.4,.7,2.12,372,2 +12,3.43,2,19,87,2,1.64,.37,1.87,1.28,.93,3.05,564,2 +11.45,2.4,2.42,20,96,2.9,2.79,.32,1.83,3.25,.8,3.39,625,2 +11.56,2.05,3.23,28.5,119,3.18,5.08,.47,1.87,6,.93,3.69,465,2 +12.42,4.43,2.73,26.5,102,2.2,2.13,.43,1.71,2.08,.92,3.12,365,2 +13.05,5.8,2.13,21.5,86,2.62,2.65,.3,2.01,2.6,.73,3.1,380,2 +11.87,4.31,2.39,21,82,2.86,3.03,.21,2.91,2.8,.75,3.64,380,2 +12.07,2.16,2.17,21,85,2.6,2.65,.37,1.35,2.76,.86,3.28,378,2 +12.43,1.53,2.29,21.5,86,2.74,3.15,.39,1.77,3.94,.69,2.84,352,2 +11.79,2.13,2.78,28.5,92,2.13,2.24,.58,1.76,3,.97,2.44,466,2 +12.37,1.63,2.3,24.5,88,2.22,2.45,.4,1.9,2.12,.89,2.78,342,2 +12.04,4.3,2.38,22,80,2.1,1.75,.42,1.35,2.6,.79,2.57,580,2 +12.86,1.35,2.32,18,122,1.51,1.25,.21,.94,4.1,.76,1.29,630,3 +12.88,2.99,2.4,20,104,1.3,1.22,.24,.83,5.4,.74,1.42,530,3 +12.81,2.31,2.4,24,98,1.15,1.09,.27,.83,5.7,.66,1.36,560,3 
+12.7,3.55,2.36,21.5,106,1.7,1.2,.17,.84,5,.78,1.29,600,3 +12.51,1.24,2.25,17.5,85,2,.58,.6,1.25,5.45,.75,1.51,650,3 +12.6,2.46,2.2,18.5,94,1.62,.66,.63,.94,7.1,.73,1.58,695,3 +12.25,4.72,2.54,21,89,1.38,.47,.53,.8,3.85,.75,1.27,720,3 +12.53,5.51,2.64,25,96,1.79,.6,.63,1.1,5,.82,1.69,515,3 +13.49,3.59,2.19,19.5,88,1.62,.48,.58,.88,5.7,.81,1.82,580,3 +12.84,2.96,2.61,24,101,2.32,.6,.53,.81,4.92,.89,2.15,590,3 +12.93,2.81,2.7,21,96,1.54,.5,.53,.75,4.6,.77,2.31,600,3 +13.36,2.56,2.35,20,89,1.4,.5,.37,.64,5.6,.7,2.47,780,3 +13.52,3.17,2.72,23.5,97,1.55,.52,.5,.55,4.35,.89,2.06,520,3 +13.62,4.95,2.35,20,92,2,.8,.47,1.02,4.4,.91,2.05,550,3 +12.25,3.88,2.2,18.5,112,1.38,.78,.29,1.14,8.21,.65,2,855,3 +13.16,3.57,2.15,21,102,1.5,.55,.43,1.3,4,.6,1.68,830,3 +13.88,5.04,2.23,20,80,.98,.34,.4,.68,4.9,.58,1.33,415,3 +12.87,4.61,2.48,21.5,86,1.7,.65,.47,.86,7.65,.54,1.86,625,3 +13.32,3.24,2.38,21.5,92,1.93,.76,.45,1.25,8.42,.55,1.62,650,3 +13.08,3.9,2.36,21.5,113,1.41,1.39,.34,1.14,9.40,.57,1.33,550,3 +13.5,3.12,2.62,24,123,1.4,1.57,.22,1.25,8.60,.59,1.3,500,3 +12.79,2.67,2.48,22,112,1.48,1.36,.24,1.26,10.8,.48,1.47,480,3 +13.11,1.9,2.75,25.5,116,2.2,1.28,.26,1.56,7.1,.61,1.33,425,3 +13.23,3.3,2.28,18.5,98,1.8,.83,.61,1.87,10.52,.56,1.51,675,3 +12.58,1.29,2.1,20,103,1.48,.58,.53,1.4,7.6,.58,1.55,640,3 +13.17,5.19,2.32,22,93,1.74,.63,.61,1.55,7.9,.6,1.48,725,3 +13.84,4.12,2.38,19.5,89,1.8,.83,.48,1.56,9.01,.57,1.64,480,3 +12.45,3.03,2.64,27,97,1.9,.58,.63,1.14,7.5,.67,1.73,880,3 +14.34,1.68,2.7,25,98,2.8,1.31,.53,2.7,13,.57,1.96,660,3 +13.48,1.67,2.64,22.5,89,2.6,1.1,.52,2.29,11.75,.57,1.78,620,3 +12.36,3.83,2.38,21,88,2.3,.92,.5,1.04,7.65,.56,1.58,520,3 +13.69,3.26,2.54,20,107,1.83,.56,.5,.8,5.88,.96,1.82,680,3 +12.85,3.27,2.58,22,106,1.65,.6,.6,.96,5.58,.87,2.11,570,3 +12.96,3.45,2.35,18.5,106,1.39,.7,.4,.94,5.28,.68,1.75,675,3 +13.78,2.76,2.3,22,90,1.35,.68,.41,1.03,9.58,.7,1.68,615,3 +13.73,4.36,2.26,22.5,88,1.28,.47,.52,1.15,6.62,.78,1.75,520,3 
+13.45,3.7,2.6,23,111,1.7,.92,.43,1.46,10.68,.85,1.56,695,3 +12.82,3.37,2.3,19.5,88,1.48,.66,.4,.97,10.26,.72,1.75,685,3 +13.58,2.58,2.69,24.5,105,1.55,.84,.39,1.54,8.66,.74,1.8,750,3 +13.4,4.6,2.86,25,112,1.98,.96,.27,1.11,8.5,.67,1.92,630,3 +12.2,3.03,2.32,19,96,1.25,.49,.4,.73,5.5,.66,1.83,510,3 +12.77,2.39,2.28,19.5,86,1.39,.51,.48,.64,9.899999,.57,1.63,470,3 +14.16,2.51,2.48,20,91,1.68,.7,.44,1.24,9.7,.62,1.71,660,3 +13.71,5.65,2.45,20.5,95,1.68,.61,.52,1.06,7.7,.64,1.74,740,3 +13.4,3.91,2.48,23,102,1.8,.75,.43,1.41,7.3,.7,1.56,750,3 +13.27,4.28,2.26,20,120,1.59,.69,.43,1.35,10.2,.59,1.56,835,3 +13.17,2.59,2.37,20,120,1.65,.68,.53,1.46,9.3,.6,1.62,840,3 +14.13,4.1,2.74,24.5,96,2.05,.76,.56,1.35,9.2,.61,1.6,560,3 \ No newline at end of file diff --git a/src/Phpml/Dataset/Demo/Wine.php b/src/Phpml/Dataset/Demo/Wine.php new file mode 100644 index 0000000..1ca48f0 --- /dev/null +++ b/src/Phpml/Dataset/Demo/Wine.php @@ -0,0 +1,22 @@ + $label) { - if ($label === $predictedLabels[$index]) { + if ($label == $predictedLabels[$index]) { ++$score; } } diff --git a/tests/Phpml/Classifier/KNearestNeighborsTest.php b/tests/Phpml/Classifier/KNearestNeighborsTest.php index 1050607..39edfb8 100644 --- a/tests/Phpml/Classifier/KNearestNeighborsTest.php +++ b/tests/Phpml/Classifier/KNearestNeighborsTest.php @@ -7,6 +7,7 @@ namespace tests\Classifier; use Phpml\Classifier\KNearestNeighbors; use Phpml\CrossValidation\RandomSplit; use Phpml\Dataset\Demo\Iris; +use Phpml\Dataset\Demo\Wine; use Phpml\Metric\Accuracy; class KNearestNeighborsTest extends \PHPUnit_Framework_TestCase @@ -55,4 +56,16 @@ class KNearestNeighborsTest extends \PHPUnit_Framework_TestCase $this->assertEquals(0.96, $score); } + + public function testAccuracyOnWineDataset() + { + $dataset = new RandomSplit(new Wine(), $testSize = 0.3, $seed = 321); + $classifier = new KNearestNeighbors(1); + $classifier->train($dataset->getTrainSamples(), $dataset->getTrainLabels()); + $predicted = 
$classifier->predict($dataset->getTestSamples()); + $score = Accuracy::score($dataset->getTestLabels(), $predicted); + + $this->assertEquals(0.85185185185185186, $score); + } + } diff --git a/tests/Phpml/Dataset/Demo/WineTest.php b/tests/Phpml/Dataset/Demo/WineTest.php new file mode 100644 index 0000000..5aa3a3b --- /dev/null +++ b/tests/Phpml/Dataset/Demo/WineTest.php @@ -0,0 +1,22 @@ +assertEquals(178, count($iris->getSamples())); + $this->assertEquals(178, count($iris->getLabels())); + + // one sample features count + $this->assertEquals(13, count($iris->getSamples()[0])); + } +} From c9c592cb093bf1a608977f715fbaf617edd7890f Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sat, 9 Apr 2016 15:46:54 +0200 Subject: [PATCH 03/59] add glass identification dataset --- data/glass.csv | 215 ++++++++++++++++++ src/Phpml/Dataset/Demo/Glass.php | 28 +++ .../Classifier/KNearestNeighborsTest.php | 12 + tests/Phpml/Dataset/Demo/GlassTest.php | 22 ++ 4 files changed, 277 insertions(+) create mode 100644 data/glass.csv create mode 100644 src/Phpml/Dataset/Demo/Glass.php create mode 100644 tests/Phpml/Dataset/Demo/GlassTest.php diff --git a/data/glass.csv b/data/glass.csv new file mode 100644 index 0000000..77522db --- /dev/null +++ b/data/glass.csv @@ -0,0 +1,215 @@ +RI: refractive index,Na: Sodium,Mg: Magnesium,Al: Aluminum,Si: Silicon,K: Potassium,Ca: Calcium,Ba: Barium,Fe: Iron,type of glass +1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.00,building_windows_float_processed +1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.00,building_windows_float_processed +1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.00,building_windows_float_processed +1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.00,building_windows_float_processed +1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.00,building_windows_float_processed +1.51596,12.79,3.61,1.62,72.97,0.64,8.07,0.00,0.26,building_windows_float_processed +1.51743,13.30,3.60,1.14,73.09,0.58,8.17,0.00,0.00,building_windows_float_processed 
+1.51756,13.15,3.61,1.05,73.24,0.57,8.24,0.00,0.00,building_windows_float_processed +1.51918,14.04,3.58,1.37,72.08,0.56,8.30,0.00,0.00,building_windows_float_processed +1.51755,13.00,3.60,1.36,72.99,0.57,8.40,0.00,0.11,building_windows_float_processed +1.51571,12.72,3.46,1.56,73.20,0.67,8.09,0.00,0.24,building_windows_float_processed +1.51763,12.80,3.66,1.27,73.01,0.60,8.56,0.00,0.00,building_windows_float_processed +1.51589,12.88,3.43,1.40,73.28,0.69,8.05,0.00,0.24,building_windows_float_processed +1.51748,12.86,3.56,1.27,73.21,0.54,8.38,0.00,0.17,building_windows_float_processed +1.51763,12.61,3.59,1.31,73.29,0.58,8.50,0.00,0.00,building_windows_float_processed +1.51761,12.81,3.54,1.23,73.24,0.58,8.39,0.00,0.00,building_windows_float_processed +1.51784,12.68,3.67,1.16,73.11,0.61,8.70,0.00,0.00,building_windows_float_processed +1.52196,14.36,3.85,0.89,71.36,0.15,9.15,0.00,0.00,building_windows_float_processed +1.51911,13.90,3.73,1.18,72.12,0.06,8.89,0.00,0.00,building_windows_float_processed +1.51735,13.02,3.54,1.69,72.73,0.54,8.44,0.00,0.07,building_windows_float_processed +1.51750,12.82,3.55,1.49,72.75,0.54,8.52,0.00,0.19,building_windows_float_processed +1.51966,14.77,3.75,0.29,72.02,0.03,9.00,0.00,0.00,building_windows_float_processed +1.51736,12.78,3.62,1.29,72.79,0.59,8.70,0.00,0.00,building_windows_float_processed +1.51751,12.81,3.57,1.35,73.02,0.62,8.59,0.00,0.00,building_windows_float_processed +1.51720,13.38,3.50,1.15,72.85,0.50,8.43,0.00,0.00,building_windows_float_processed +1.51764,12.98,3.54,1.21,73.00,0.65,8.53,0.00,0.00,building_windows_float_processed +1.51793,13.21,3.48,1.41,72.64,0.59,8.43,0.00,0.00,building_windows_float_processed +1.51721,12.87,3.48,1.33,73.04,0.56,8.43,0.00,0.00,building_windows_float_processed +1.51768,12.56,3.52,1.43,73.15,0.57,8.54,0.00,0.00,building_windows_float_processed +1.51784,13.08,3.49,1.28,72.86,0.60,8.49,0.00,0.00,building_windows_float_processed 
+1.51768,12.65,3.56,1.30,73.08,0.61,8.69,0.00,0.14,building_windows_float_processed +1.51747,12.84,3.50,1.14,73.27,0.56,8.55,0.00,0.00,building_windows_float_processed +1.51775,12.85,3.48,1.23,72.97,0.61,8.56,0.09,0.22,building_windows_float_processed +1.51753,12.57,3.47,1.38,73.39,0.60,8.55,0.00,0.06,building_windows_float_processed +1.51783,12.69,3.54,1.34,72.95,0.57,8.75,0.00,0.00,building_windows_float_processed +1.51567,13.29,3.45,1.21,72.74,0.56,8.57,0.00,0.00,building_windows_float_processed +1.51909,13.89,3.53,1.32,71.81,0.51,8.78,0.11,0.00,building_windows_float_processed +1.51797,12.74,3.48,1.35,72.96,0.64,8.68,0.00,0.00,building_windows_float_processed +1.52213,14.21,3.82,0.47,71.77,0.11,9.57,0.00,0.00,building_windows_float_processed +1.52213,14.21,3.82,0.47,71.77,0.11,9.57,0.00,0.00,building_windows_float_processed +1.51793,12.79,3.50,1.12,73.03,0.64,8.77,0.00,0.00,building_windows_float_processed +1.51755,12.71,3.42,1.20,73.20,0.59,8.64,0.00,0.00,building_windows_float_processed +1.51779,13.21,3.39,1.33,72.76,0.59,8.59,0.00,0.00,building_windows_float_processed +1.52210,13.73,3.84,0.72,71.76,0.17,9.74,0.00,0.00,building_windows_float_processed +1.51786,12.73,3.43,1.19,72.95,0.62,8.76,0.00,0.30,building_windows_float_processed +1.51900,13.49,3.48,1.35,71.95,0.55,9.00,0.00,0.00,building_windows_float_processed +1.51869,13.19,3.37,1.18,72.72,0.57,8.83,0.00,0.16,building_windows_float_processed +1.52667,13.99,3.70,0.71,71.57,0.02,9.82,0.00,0.10,building_windows_float_processed +1.52223,13.21,3.77,0.79,71.99,0.13,10.02,0.00,0.00,building_windows_float_processed +1.51898,13.58,3.35,1.23,72.08,0.59,8.91,0.00,0.00,building_windows_float_processed +1.52320,13.72,3.72,0.51,71.75,0.09,10.06,0.00,0.16,building_windows_float_processed +1.51926,13.20,3.33,1.28,72.36,0.60,9.14,0.00,0.11,building_windows_float_processed +1.51808,13.43,2.87,1.19,72.84,0.55,9.03,0.00,0.00,building_windows_float_processed 
+1.51837,13.14,2.84,1.28,72.85,0.55,9.07,0.00,0.00,building_windows_float_processed +1.51778,13.21,2.81,1.29,72.98,0.51,9.02,0.00,0.09,building_windows_float_processed +1.51769,12.45,2.71,1.29,73.70,0.56,9.06,0.00,0.24,building_windows_float_processed +1.51215,12.99,3.47,1.12,72.98,0.62,8.35,0.00,0.31,building_windows_float_processed +1.51824,12.87,3.48,1.29,72.95,0.60,8.43,0.00,0.00,building_windows_float_processed +1.51754,13.48,3.74,1.17,72.99,0.59,8.03,0.00,0.00,building_windows_float_processed +1.51754,13.39,3.66,1.19,72.79,0.57,8.27,0.00,0.11,building_windows_float_processed +1.51905,13.60,3.62,1.11,72.64,0.14,8.76,0.00,0.00,building_windows_float_processed +1.51977,13.81,3.58,1.32,71.72,0.12,8.67,0.69,0.00,building_windows_float_processed +1.52172,13.51,3.86,0.88,71.79,0.23,9.54,0.00,0.11,building_windows_float_processed +1.52227,14.17,3.81,0.78,71.35,0.00,9.69,0.00,0.00,building_windows_float_processed +1.52172,13.48,3.74,0.90,72.01,0.18,9.61,0.00,0.07,building_windows_float_processed +1.52099,13.69,3.59,1.12,71.96,0.09,9.40,0.00,0.00,building_windows_float_processed +1.52152,13.05,3.65,0.87,72.22,0.19,9.85,0.00,0.17,building_windows_float_processed +1.52152,13.05,3.65,0.87,72.32,0.19,9.85,0.00,0.17,building_windows_float_processed +1.52152,13.12,3.58,0.90,72.20,0.23,9.82,0.00,0.16,building_windows_float_processed +1.52300,13.31,3.58,0.82,71.99,0.12,10.17,0.00,0.03,building_windows_float_processed +1.51574,14.86,3.67,1.74,71.87,0.16,7.36,0.00,0.12,building_windows_non_float_processed +1.51848,13.64,3.87,1.27,71.96,0.54,8.32,0.00,0.32,building_windows_non_float_processed +1.51593,13.09,3.59,1.52,73.10,0.67,7.83,0.00,0.00,building_windows_non_float_processed +1.51631,13.34,3.57,1.57,72.87,0.61,7.89,0.00,0.00,building_windows_non_float_processed +1.51596,13.02,3.56,1.54,73.11,0.72,7.90,0.00,0.00,building_windows_non_float_processed +1.51590,13.02,3.58,1.51,73.12,0.69,7.96,0.00,0.00,building_windows_non_float_processed 
+1.51645,13.44,3.61,1.54,72.39,0.66,8.03,0.00,0.00,building_windows_non_float_processed +1.51627,13.00,3.58,1.54,72.83,0.61,8.04,0.00,0.00,building_windows_non_float_processed +1.51613,13.92,3.52,1.25,72.88,0.37,7.94,0.00,0.14,building_windows_non_float_processed +1.51590,12.82,3.52,1.90,72.86,0.69,7.97,0.00,0.00,building_windows_non_float_processed +1.51592,12.86,3.52,2.12,72.66,0.69,7.97,0.00,0.00,building_windows_non_float_processed +1.51593,13.25,3.45,1.43,73.17,0.61,7.86,0.00,0.00,building_windows_non_float_processed +1.51646,13.41,3.55,1.25,72.81,0.68,8.10,0.00,0.00,building_windows_non_float_processed +1.51594,13.09,3.52,1.55,72.87,0.68,8.05,0.00,0.09,building_windows_non_float_processed +1.51409,14.25,3.09,2.08,72.28,1.10,7.08,0.00,0.00,building_windows_non_float_processed +1.51625,13.36,3.58,1.49,72.72,0.45,8.21,0.00,0.00,building_windows_non_float_processed +1.51569,13.24,3.49,1.47,73.25,0.38,8.03,0.00,0.00,building_windows_non_float_processed +1.51645,13.40,3.49,1.52,72.65,0.67,8.08,0.00,0.10,building_windows_non_float_processed +1.51618,13.01,3.50,1.48,72.89,0.60,8.12,0.00,0.00,building_windows_non_float_processed +1.51640,12.55,3.48,1.87,73.23,0.63,8.08,0.00,0.09,building_windows_non_float_processed +1.51841,12.93,3.74,1.11,72.28,0.64,8.96,0.00,0.22,building_windows_non_float_processed +1.51605,12.90,3.44,1.45,73.06,0.44,8.27,0.00,0.00,building_windows_non_float_processed +1.51588,13.12,3.41,1.58,73.26,0.07,8.39,0.00,0.19,building_windows_non_float_processed +1.51590,13.24,3.34,1.47,73.10,0.39,8.22,0.00,0.00,building_windows_non_float_processed +1.51629,12.71,3.33,1.49,73.28,0.67,8.24,0.00,0.00,building_windows_non_float_processed +1.51860,13.36,3.43,1.43,72.26,0.51,8.60,0.00,0.00,building_windows_non_float_processed +1.51841,13.02,3.62,1.06,72.34,0.64,9.13,0.00,0.15,building_windows_non_float_processed +1.51743,12.20,3.25,1.16,73.55,0.62,8.90,0.00,0.24,building_windows_non_float_processed 
+1.51689,12.67,2.88,1.71,73.21,0.73,8.54,0.00,0.00,building_windows_non_float_processed +1.51811,12.96,2.96,1.43,72.92,0.60,8.79,0.14,0.00,building_windows_non_float_processed +1.51655,12.75,2.85,1.44,73.27,0.57,8.79,0.11,0.22,building_windows_non_float_processed +1.51730,12.35,2.72,1.63,72.87,0.70,9.23,0.00,0.00,building_windows_non_float_processed +1.51820,12.62,2.76,0.83,73.81,0.35,9.42,0.00,0.20,building_windows_non_float_processed +1.52725,13.80,3.15,0.66,70.57,0.08,11.64,0.00,0.00,building_windows_non_float_processed +1.52410,13.83,2.90,1.17,71.15,0.08,10.79,0.00,0.00,building_windows_non_float_processed +1.52475,11.45,0.00,1.88,72.19,0.81,13.24,0.00,0.34,building_windows_non_float_processed +1.53125,10.73,0.00,2.10,69.81,0.58,13.30,3.15,0.28,building_windows_non_float_processed +1.53393,12.30,0.00,1.00,70.16,0.12,16.19,0.00,0.24,building_windows_non_float_processed +1.52222,14.43,0.00,1.00,72.67,0.10,11.52,0.00,0.08,building_windows_non_float_processed +1.51818,13.72,0.00,0.56,74.45,0.00,10.99,0.00,0.00,building_windows_non_float_processed +1.52664,11.23,0.00,0.77,73.21,0.00,14.68,0.00,0.00,building_windows_non_float_processed +1.52739,11.02,0.00,0.75,73.08,0.00,14.96,0.00,0.00,building_windows_non_float_processed +1.52777,12.64,0.00,0.67,72.02,0.06,14.40,0.00,0.00,building_windows_non_float_processed +1.51892,13.46,3.83,1.26,72.55,0.57,8.21,0.00,0.14,building_windows_non_float_processed +1.51847,13.10,3.97,1.19,72.44,0.60,8.43,0.00,0.00,building_windows_non_float_processed +1.51846,13.41,3.89,1.33,72.38,0.51,8.28,0.00,0.00,building_windows_non_float_processed +1.51829,13.24,3.90,1.41,72.33,0.55,8.31,0.00,0.10,building_windows_non_float_processed +1.51708,13.72,3.68,1.81,72.06,0.64,7.88,0.00,0.00,building_windows_non_float_processed +1.51673,13.30,3.64,1.53,72.53,0.65,8.03,0.00,0.29,building_windows_non_float_processed +1.51652,13.56,3.57,1.47,72.45,0.64,7.96,0.00,0.00,building_windows_non_float_processed 
+1.51844,13.25,3.76,1.32,72.40,0.58,8.42,0.00,0.00,building_windows_non_float_processed +1.51663,12.93,3.54,1.62,72.96,0.64,8.03,0.00,0.21,building_windows_non_float_processed +1.51687,13.23,3.54,1.48,72.84,0.56,8.10,0.00,0.00,building_windows_non_float_processed +1.51707,13.48,3.48,1.71,72.52,0.62,7.99,0.00,0.00,building_windows_non_float_processed +1.52177,13.20,3.68,1.15,72.75,0.54,8.52,0.00,0.00,building_windows_non_float_processed +1.51872,12.93,3.66,1.56,72.51,0.58,8.55,0.00,0.12,building_windows_non_float_processed +1.51667,12.94,3.61,1.26,72.75,0.56,8.60,0.00,0.00,building_windows_non_float_processed +1.52081,13.78,2.28,1.43,71.99,0.49,9.85,0.00,0.17,building_windows_non_float_processed +1.52068,13.55,2.09,1.67,72.18,0.53,9.57,0.27,0.17,building_windows_non_float_processed +1.52020,13.98,1.35,1.63,71.76,0.39,10.56,0.00,0.18,building_windows_non_float_processed +1.52177,13.75,1.01,1.36,72.19,0.33,11.14,0.00,0.00,building_windows_non_float_processed +1.52614,13.70,0.00,1.36,71.24,0.19,13.44,0.00,0.10,building_windows_non_float_processed +1.51813,13.43,3.98,1.18,72.49,0.58,8.15,0.00,0.00,building_windows_non_float_processed +1.51800,13.71,3.93,1.54,71.81,0.54,8.21,0.00,0.15,building_windows_non_float_processed +1.51811,13.33,3.85,1.25,72.78,0.52,8.12,0.00,0.00,building_windows_non_float_processed +1.51789,13.19,3.90,1.30,72.33,0.55,8.44,0.00,0.28,building_windows_non_float_processed +1.51806,13.00,3.80,1.08,73.07,0.56,8.38,0.00,0.12,building_windows_non_float_processed +1.51711,12.89,3.62,1.57,72.96,0.61,8.11,0.00,0.00,building_windows_non_float_processed +1.51674,12.79,3.52,1.54,73.36,0.66,7.90,0.00,0.00,building_windows_non_float_processed +1.51674,12.87,3.56,1.64,73.14,0.65,7.99,0.00,0.00,building_windows_non_float_processed +1.51690,13.33,3.54,1.61,72.54,0.68,8.11,0.00,0.00,building_windows_non_float_processed +1.51851,13.20,3.63,1.07,72.83,0.57,8.41,0.09,0.17,building_windows_non_float_processed 
+1.51662,12.85,3.51,1.44,73.01,0.68,8.23,0.06,0.25,building_windows_non_float_processed +1.51709,13.00,3.47,1.79,72.72,0.66,8.18,0.00,0.00,building_windows_non_float_processed +1.51660,12.99,3.18,1.23,72.97,0.58,8.81,0.00,0.24,building_windows_non_float_processed +1.51839,12.85,3.67,1.24,72.57,0.62,8.68,0.00,0.35,building_windows_non_float_processed +1.51769,13.65,3.66,1.11,72.77,0.11,8.60,0.00,0.00,vehicle_windows_float_processed +1.51610,13.33,3.53,1.34,72.67,0.56,8.33,0.00,0.00,vehicle_windows_float_processed +1.51670,13.24,3.57,1.38,72.70,0.56,8.44,0.00,0.10,vehicle_windows_float_processed +1.51643,12.16,3.52,1.35,72.89,0.57,8.53,0.00,0.00,vehicle_windows_float_processed +1.51665,13.14,3.45,1.76,72.48,0.60,8.38,0.00,0.17,vehicle_windows_float_processed +1.52127,14.32,3.90,0.83,71.50,0.00,9.49,0.00,0.00,vehicle_windows_float_processed +1.51779,13.64,3.65,0.65,73.00,0.06,8.93,0.00,0.00,vehicle_windows_float_processed +1.51610,13.42,3.40,1.22,72.69,0.59,8.32,0.00,0.00,vehicle_windows_float_processed +1.51694,12.86,3.58,1.31,72.61,0.61,8.79,0.00,0.00,vehicle_windows_float_processed +1.51646,13.04,3.40,1.26,73.01,0.52,8.58,0.00,0.00,vehicle_windows_float_processed +1.51655,13.41,3.39,1.28,72.64,0.52,8.65,0.00,0.00,vehicle_windows_float_processed +1.52121,14.03,3.76,0.58,71.79,0.11,9.65,0.00,0.00,vehicle_windows_float_processed +1.51776,13.53,3.41,1.52,72.04,0.58,8.79,0.00,0.00,vehicle_windows_float_processed +1.51796,13.50,3.36,1.63,71.94,0.57,8.81,0.00,0.09,vehicle_windows_float_processed +1.51832,13.33,3.34,1.54,72.14,0.56,8.99,0.00,0.00,vehicle_windows_float_processed +1.51934,13.64,3.54,0.75,72.65,0.16,8.89,0.15,0.24,vehicle_windows_float_processed +1.52211,14.19,3.78,0.91,71.36,0.23,9.14,0.00,0.37,vehicle_windows_float_processed +1.51514,14.01,2.68,3.50,69.89,1.68,5.87,2.20,0.00,containers +1.51915,12.73,1.85,1.86,72.69,0.60,10.09,0.00,0.00,containers +1.52171,11.56,1.88,1.56,72.86,0.47,11.41,0.00,0.00,containers 
+1.52151,11.03,1.71,1.56,73.44,0.58,11.62,0.00,0.00,containers +1.51969,12.64,0.00,1.65,73.75,0.38,11.53,0.00,0.00,containers +1.51666,12.86,0.00,1.83,73.88,0.97,10.17,0.00,0.00,containers +1.51994,13.27,0.00,1.76,73.03,0.47,11.32,0.00,0.00,containers +1.52369,13.44,0.00,1.58,72.22,0.32,12.24,0.00,0.00,containers +1.51316,13.02,0.00,3.04,70.48,6.21,6.96,0.00,0.00,containers +1.51321,13.00,0.00,3.02,70.70,6.21,6.93,0.00,0.00,containers +1.52043,13.38,0.00,1.40,72.25,0.33,12.50,0.00,0.00,containers +1.52058,12.85,1.61,2.17,72.18,0.76,9.70,0.24,0.51,containers +1.52119,12.97,0.33,1.51,73.39,0.13,11.27,0.00,0.28,containers +1.51905,14.00,2.39,1.56,72.37,0.00,9.57,0.00,0.00,tableware +1.51937,13.79,2.41,1.19,72.76,0.00,9.77,0.00,0.00,tableware +1.51829,14.46,2.24,1.62,72.38,0.00,9.26,0.00,0.00,tableware +1.51852,14.09,2.19,1.66,72.67,0.00,9.32,0.00,0.00,tableware +1.51299,14.40,1.74,1.54,74.55,0.00,7.59,0.00,0.00,tableware +1.51888,14.99,0.78,1.74,72.50,0.00,9.95,0.00,0.00,tableware +1.51916,14.15,0.00,2.09,72.74,0.00,10.88,0.00,0.00,tableware +1.51969,14.56,0.00,0.56,73.48,0.00,11.22,0.00,0.00,tableware +1.51115,17.38,0.00,0.34,75.41,0.00,6.65,0.00,0.00,tableware +1.51131,13.69,3.20,1.81,72.81,1.76,5.43,1.19,0.00,headlamps +1.51838,14.32,3.26,2.22,71.25,1.46,5.79,1.63,0.00,headlamps +1.52315,13.44,3.34,1.23,72.38,0.60,8.83,0.00,0.00,headlamps +1.52247,14.86,2.20,2.06,70.26,0.76,9.76,0.00,0.00,headlamps +1.52365,15.79,1.83,1.31,70.43,0.31,8.61,1.68,0.00,headlamps +1.51613,13.88,1.78,1.79,73.10,0.00,8.67,0.76,0.00,headlamps +1.51602,14.85,0.00,2.38,73.28,0.00,8.76,0.64,0.09,headlamps +1.51623,14.20,0.00,2.79,73.46,0.04,9.04,0.40,0.09,headlamps +1.51719,14.75,0.00,2.00,73.02,0.00,8.53,1.59,0.08,headlamps +1.51683,14.56,0.00,1.98,73.29,0.00,8.52,1.57,0.07,headlamps +1.51545,14.14,0.00,2.68,73.39,0.08,9.07,0.61,0.05,headlamps +1.51556,13.87,0.00,2.54,73.23,0.14,9.41,0.81,0.01,headlamps +1.51727,14.70,0.00,2.34,73.28,0.00,8.95,0.66,0.00,headlamps 
+1.51531,14.38,0.00,2.66,73.10,0.04,9.08,0.64,0.00,headlamps +1.51609,15.01,0.00,2.51,73.05,0.05,8.83,0.53,0.00,headlamps +1.51508,15.15,0.00,2.25,73.50,0.00,8.34,0.63,0.00,headlamps +1.51653,11.95,0.00,1.19,75.18,2.70,8.93,0.00,0.00,headlamps +1.51514,14.85,0.00,2.42,73.72,0.00,8.39,0.56,0.00,headlamps +1.51658,14.80,0.00,1.99,73.11,0.00,8.28,1.71,0.00,headlamps +1.51617,14.95,0.00,2.27,73.30,0.00,8.71,0.67,0.00,headlamps +1.51732,14.95,0.00,1.80,72.99,0.00,8.61,1.55,0.00,headlamps +1.51645,14.94,0.00,1.87,73.11,0.00,8.67,1.38,0.00,headlamps +1.51831,14.39,0.00,1.82,72.86,1.41,6.47,2.88,0.00,headlamps +1.51640,14.37,0.00,2.74,72.85,0.00,9.45,0.54,0.00,headlamps +1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.00,headlamps +1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.00,headlamps +1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.00,headlamps +1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.00,headlamps +1.51711,14.23,0.00,2.08,73.36,0.00,8.62,1.67,0.00,headlamps diff --git a/src/Phpml/Dataset/Demo/Glass.php b/src/Phpml/Dataset/Demo/Glass.php new file mode 100644 index 0000000..bcf5e97 --- /dev/null +++ b/src/Phpml/Dataset/Demo/Glass.php @@ -0,0 +1,28 @@ +assertEquals(0.85185185185185186, $score); } + public function testAccuracyOnGlassDataset() + { + $dataset = new RandomSplit(new Glass(), $testSize = 0.3, $seed = 456); + $classifier = new KNearestNeighbors(7); + $classifier->train($dataset->getTrainSamples(), $dataset->getTrainLabels()); + $predicted = $classifier->predict($dataset->getTestSamples()); + $score = Accuracy::score($dataset->getTestLabels(), $predicted); + + $this->assertEquals(0.69230769230769229, $score); + } + } diff --git a/tests/Phpml/Dataset/Demo/GlassTest.php b/tests/Phpml/Dataset/Demo/GlassTest.php new file mode 100644 index 0000000..62cef3a --- /dev/null +++ b/tests/Phpml/Dataset/Demo/GlassTest.php @@ -0,0 +1,22 @@ +assertEquals(214, count($iris->getSamples())); + $this->assertEquals(214, count($iris->getLabels())); + + // one sample features 
count + $this->assertEquals(9, count($iris->getSamples()[0])); + } +} From a992f652005bbb3ea50ce6e12b3c79816e833875 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sat, 9 Apr 2016 15:50:48 +0200 Subject: [PATCH 04/59] remove accuracy score tests on datasets --- .../Classifier/KNearestNeighborsTest.php | 33 ------------------- 1 file changed, 33 deletions(-) diff --git a/tests/Phpml/Classifier/KNearestNeighborsTest.php b/tests/Phpml/Classifier/KNearestNeighborsTest.php index 7faf179..2839d35 100644 --- a/tests/Phpml/Classifier/KNearestNeighborsTest.php +++ b/tests/Phpml/Classifier/KNearestNeighborsTest.php @@ -47,37 +47,4 @@ class KNearestNeighborsTest extends \PHPUnit_Framework_TestCase $this->assertEquals($testLabels, $predicted); } - public function testAccuracyOnIrisDataset() - { - $dataset = new RandomSplit(new Iris(), $testSize = 0.5, $seed = 123); - $classifier = new KNearestNeighbors($k = 4); - $classifier->train($dataset->getTrainSamples(), $dataset->getTrainLabels()); - $predicted = $classifier->predict($dataset->getTestSamples()); - $score = Accuracy::score($dataset->getTestLabels(), $predicted); - - $this->assertEquals(0.96, $score); - } - - public function testAccuracyOnWineDataset() - { - $dataset = new RandomSplit(new Wine(), $testSize = 0.3, $seed = 321); - $classifier = new KNearestNeighbors(1); - $classifier->train($dataset->getTrainSamples(), $dataset->getTrainLabels()); - $predicted = $classifier->predict($dataset->getTestSamples()); - $score = Accuracy::score($dataset->getTestLabels(), $predicted); - - $this->assertEquals(0.85185185185185186, $score); - } - - public function testAccuracyOnGlassDataset() - { - $dataset = new RandomSplit(new Glass(), $testSize = 0.3, $seed = 456); - $classifier = new KNearestNeighbors(7); - $classifier->train($dataset->getTrainSamples(), $dataset->getTrainLabels()); - $predicted = $classifier->predict($dataset->getTestSamples()); - $score = Accuracy::score($dataset->getTestLabels(), $predicted); - - 
$this->assertEquals(0.69230769230769229, $score); - } - } From 171c6974e73f6409eaf7280882d81c1326014594 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sat, 9 Apr 2016 15:52:22 +0200 Subject: [PATCH 05/59] remove accuracy score tests on datasets --- src/Phpml/Dataset/Demo/Glass.php | 2 +- src/Phpml/Dataset/Demo/Wine.php | 2 +- tests/Phpml/Classifier/KNearestNeighborsTest.php | 6 ------ 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/src/Phpml/Dataset/Demo/Glass.php b/src/Phpml/Dataset/Demo/Glass.php index bcf5e97..2a3d7e2 100644 --- a/src/Phpml/Dataset/Demo/Glass.php +++ b/src/Phpml/Dataset/Demo/Glass.php @@ -16,7 +16,7 @@ use Phpml\Dataset\CsvDataset; * 9 tableware * 29 headlamps * Samples total: 214 - * Features per sample: 9 + * Features per sample: 9. */ class Glass extends CsvDataset { diff --git a/src/Phpml/Dataset/Demo/Wine.php b/src/Phpml/Dataset/Demo/Wine.php index 1ca48f0..3bc71a9 100644 --- a/src/Phpml/Dataset/Demo/Wine.php +++ b/src/Phpml/Dataset/Demo/Wine.php @@ -10,7 +10,7 @@ use Phpml\Dataset\CsvDataset; * Classes: 3 * Samples per class: class 1 59; class 2 71; class 3 48 * Samples total: 178 - * Features per sample: 13 + * Features per sample: 13. 
*/ class Wine extends CsvDataset { diff --git a/tests/Phpml/Classifier/KNearestNeighborsTest.php b/tests/Phpml/Classifier/KNearestNeighborsTest.php index 2839d35..7b0fefe 100644 --- a/tests/Phpml/Classifier/KNearestNeighborsTest.php +++ b/tests/Phpml/Classifier/KNearestNeighborsTest.php @@ -5,11 +5,6 @@ declare (strict_types = 1); namespace tests\Classifier; use Phpml\Classifier\KNearestNeighbors; -use Phpml\CrossValidation\RandomSplit; -use Phpml\Dataset\Demo\Glass; -use Phpml\Dataset\Demo\Iris; -use Phpml\Dataset\Demo\Wine; -use Phpml\Metric\Accuracy; class KNearestNeighborsTest extends \PHPUnit_Framework_TestCase { @@ -46,5 +41,4 @@ class KNearestNeighborsTest extends \PHPUnit_Framework_TestCase $this->assertEquals($testLabels, $predicted); } - } From 3453b229cfec9e16628f89eaa2e2cf53c589d485 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sun, 10 Apr 2016 11:50:24 +0200 Subject: [PATCH 06/59] add travis config --- .travis.yml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..59de242 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,6 @@ +language: php +php: + - '7.0' + - hhvm +before_script: composer install +script: phpunit \ No newline at end of file From 9c83401e90639f5f7b29fcd5e2df3c9135d0295f Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sun, 10 Apr 2016 11:56:11 +0200 Subject: [PATCH 07/59] fix travis build --- .travis.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 59de242..247e222 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,5 @@ language: php php: - '7.0' - - hhvm before_script: composer install -script: phpunit \ No newline at end of file +script: bin/phpunit \ No newline at end of file From d169ebf730617ee5d7be3d167a86747770fa8041 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Mon, 11 Apr 2016 21:35:17 +0200 Subject: [PATCH 08/59] create Distance metrci interface and refactor 
classifier --- src/Phpml/Classifier/KNearestNeighbors.php | 16 ++++++--- src/Phpml/Dataset/Dataset.php | 1 + src/Phpml/Metric/Distance.php | 22 ++---------- src/Phpml/Metric/Distance/Euclidean.php | 35 +++++++++++++++++++ .../EuclideanTest.php} | 24 +++++++++---- 5 files changed, 67 insertions(+), 31 deletions(-) create mode 100644 src/Phpml/Metric/Distance/Euclidean.php rename tests/Phpml/Metric/{DistanceTest.php => Distance/EuclideanTest.php} (59%) diff --git a/src/Phpml/Classifier/KNearestNeighbors.php b/src/Phpml/Classifier/KNearestNeighbors.php index f913488..c0e58de 100644 --- a/src/Phpml/Classifier/KNearestNeighbors.php +++ b/src/Phpml/Classifier/KNearestNeighbors.php @@ -4,7 +4,7 @@ declare (strict_types = 1); namespace Phpml\Classifier; -use Phpml\Metric\Distance; +use Phpml\Metric\Distance\Euclidean; class KNearestNeighbors implements Classifier { @@ -13,6 +13,8 @@ class KNearestNeighbors implements Classifier */ private $k; + private $distanceMetric; + /** * @var array */ @@ -24,13 +26,19 @@ class KNearestNeighbors implements Classifier private $labels; /** - * @param int $k + * @param int $k + * @param Distance|null $distanceMetric (if null then Euclidean distance as default) */ - public function __construct(int $k = 3) + public function __construct(int $k = 3, Distance $distanceMetric = null) { + if (null === $distanceMetric) { + $distanceMetric = new Euclidean(); + } + $this->k = $k; $this->samples = []; $this->labels = []; + $this->distanceMetric = $distanceMetric; } /** @@ -95,7 +103,7 @@ class KNearestNeighbors implements Classifier $distances = []; foreach ($this->samples as $index => $neighbor) { - $distances[$index] = Distance::euclidean($sample, $neighbor); + $distances[$index] = $this->distanceMetric->distance($sample, $neighbor); } asort($distances); diff --git a/src/Phpml/Dataset/Dataset.php b/src/Phpml/Dataset/Dataset.php index 4e04931..2bc4043 100644 --- a/src/Phpml/Dataset/Dataset.php +++ b/src/Phpml/Dataset/Dataset.php @@ -6,6 +6,7 @@ 
namespace Phpml\Dataset; interface Dataset { + const SOME = 'z'; /** * @return array */ diff --git a/src/Phpml/Metric/Distance.php b/src/Phpml/Metric/Distance.php index 1b92cef..b590f61 100644 --- a/src/Phpml/Metric/Distance.php +++ b/src/Phpml/Metric/Distance.php @@ -4,31 +4,13 @@ declare (strict_types = 1); namespace Phpml\Metric; -use Phpml\Exception\InvalidArgumentException; - -class Distance +interface Distance { /** * @param array $a * @param array $b * * @return float - * - * @throws InvalidArgumentException */ - public static function euclidean(array $a, array $b): float - { - if (count($a) != count($b)) { - throw InvalidArgumentException::sizeNotMatch(); - } - - $distance = 0; - $count = count($a); - - for ($i = 0; $i < $count; ++$i) { - $distance += pow($a[$i] - $b[$i], 2); - } - - return sqrt($distance); - } + public function distance(array $a, array $b): float; } diff --git a/src/Phpml/Metric/Distance/Euclidean.php b/src/Phpml/Metric/Distance/Euclidean.php new file mode 100644 index 0000000..09d17c2 --- /dev/null +++ b/src/Phpml/Metric/Distance/Euclidean.php @@ -0,0 +1,35 @@ +distanceMetric = new Euclidean(); + } + /** * @expectedException \Phpml\Exception\InvalidArgumentException */ - public function testThrowExceptionOnInvalidArgumentsInEuclidean() + public function testThrowExceptionOnInvalidArguments() { $a = [0, 1, 2]; $b = [0, 2]; - Distance::euclidean($a, $b); + $this->distanceMetric->distance($a, $b); } public function testCalculateEuclideanDistanceForOneDimension() @@ -25,7 +35,7 @@ class DistanceTest extends \PHPUnit_Framework_TestCase $b = [2]; $expectedDistance = 2; - $actualDistance = Distance::euclidean($a, $b); + $actualDistance = $this->distanceMetric->distance($a, $b); $this->assertEquals($expectedDistance, $actualDistance); } @@ -36,7 +46,7 @@ class DistanceTest extends \PHPUnit_Framework_TestCase $b = [2, 5]; $expectedDistance = 2.2360679774998; - $actualDistance = Distance::euclidean($a, $b); + $actualDistance = 
$this->distanceMetric->distance($a, $b); $this->assertEquals($expectedDistance, $actualDistance); @@ -44,7 +54,7 @@ class DistanceTest extends \PHPUnit_Framework_TestCase $b = [2, 5, 5]; $expectedDistance = 6.7082039324993694; - $actualDistance = Distance::euclidean($a, $b); + $actualDistance = $this->distanceMetric->distance($a, $b); $this->assertEquals($expectedDistance, $actualDistance); } From 4d77a16e12f989f6c1dff9d349699967dafef1d5 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Mon, 11 Apr 2016 21:44:48 +0200 Subject: [PATCH 09/59] implement Chebyshev distance metric --- src/Phpml/Metric/Distance/Chebyshev.php | 35 ++++++++++ tests/Phpml/Metric/Distance/ChebyshevTest.php | 64 +++++++++++++++++++ tests/Phpml/Metric/Distance/EuclideanTest.php | 7 +- 3 files changed, 104 insertions(+), 2 deletions(-) create mode 100644 src/Phpml/Metric/Distance/Chebyshev.php create mode 100644 tests/Phpml/Metric/Distance/ChebyshevTest.php diff --git a/src/Phpml/Metric/Distance/Chebyshev.php b/src/Phpml/Metric/Distance/Chebyshev.php new file mode 100644 index 0000000..8146e4d --- /dev/null +++ b/src/Phpml/Metric/Distance/Chebyshev.php @@ -0,0 +1,35 @@ +distanceMetric = new Chebyshev(); + } + + /** + * @expectedException \Phpml\Exception\InvalidArgumentException + */ + public function testThrowExceptionOnInvalidArguments() + { + $a = [0, 1, 2]; + $b = [0, 2]; + + $this->distanceMetric->distance($a, $b); + } + + public function testCalculateDistanceForOneDimension() + { + $a = [4]; + $b = [2]; + + $expectedDistance = 2; + $actualDistance = $this->distanceMetric->distance($a, $b); + + $this->assertEquals($expectedDistance, $actualDistance); + } + + public function testCalculateDistanceForTwoDimensions() + { + $a = [4, 6]; + $b = [2, 5]; + + $expectedDistance = 2; + $actualDistance = $this->distanceMetric->distance($a, $b); + + $this->assertEquals($expectedDistance, $actualDistance); + } + + public function testCalculateDistanceForThreeDimensions() + { + $a = [6, 10, 3]; + $b 
= [2, 5, 5]; + + $expectedDistance = 5; + $actualDistance = $this->distanceMetric->distance($a, $b); + + $this->assertEquals($expectedDistance, $actualDistance); + } +} diff --git a/tests/Phpml/Metric/Distance/EuclideanTest.php b/tests/Phpml/Metric/Distance/EuclideanTest.php index 5022819..d9bf449 100644 --- a/tests/Phpml/Metric/Distance/EuclideanTest.php +++ b/tests/Phpml/Metric/Distance/EuclideanTest.php @@ -29,7 +29,7 @@ class EuclideanTest extends \PHPUnit_Framework_TestCase $this->distanceMetric->distance($a, $b); } - public function testCalculateEuclideanDistanceForOneDimension() + public function testCalculateDistanceForOneDimension() { $a = [4]; $b = [2]; @@ -40,7 +40,7 @@ class EuclideanTest extends \PHPUnit_Framework_TestCase $this->assertEquals($expectedDistance, $actualDistance); } - public function testCalculateEuclideanDistanceForTwoAndMoreDimension() + public function testCalculateDistanceForTwoDimensions() { $a = [4, 6]; $b = [2, 5]; @@ -49,7 +49,10 @@ class EuclideanTest extends \PHPUnit_Framework_TestCase $actualDistance = $this->distanceMetric->distance($a, $b); $this->assertEquals($expectedDistance, $actualDistance); + } + public function testCalculateDistanceForThreeDimensions() + { $a = [6, 10, 3]; $b = [2, 5, 5]; From 14bffbe38a001d8d92ff32c40a93df1719716a3c Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Mon, 11 Apr 2016 21:45:55 +0200 Subject: [PATCH 10/59] :implement Chebyshev distance metric --- tests/Phpml/Classifier/KNearestNeighborsTest.php | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/Phpml/Classifier/KNearestNeighborsTest.php b/tests/Phpml/Classifier/KNearestNeighborsTest.php index 7b0fefe..0b3e0d2 100644 --- a/tests/Phpml/Classifier/KNearestNeighborsTest.php +++ b/tests/Phpml/Classifier/KNearestNeighborsTest.php @@ -5,6 +5,7 @@ declare (strict_types = 1); namespace tests\Classifier; use Phpml\Classifier\KNearestNeighbors; +use Phpml\Metric\Distance\Chebyshev; class KNearestNeighborsTest extends 
\PHPUnit_Framework_TestCase { @@ -41,4 +42,6 @@ class KNearestNeighborsTest extends \PHPUnit_Framework_TestCase $this->assertEquals($testLabels, $predicted); } + + } From aed37e247eacad0b55f0383dcef30d93dd546c0d Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Mon, 11 Apr 2016 21:50:29 +0200 Subject: [PATCH 11/59] knn with chebyshev distance metric test --- src/Phpml/Classifier/KNearestNeighbors.php | 1 + tests/Phpml/Classifier/KNearestNeighborsTest.php | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/Phpml/Classifier/KNearestNeighbors.php b/src/Phpml/Classifier/KNearestNeighbors.php index c0e58de..9f37d98 100644 --- a/src/Phpml/Classifier/KNearestNeighbors.php +++ b/src/Phpml/Classifier/KNearestNeighbors.php @@ -4,6 +4,7 @@ declare (strict_types = 1); namespace Phpml\Classifier; +use Phpml\Metric\Distance; use Phpml\Metric\Distance\Euclidean; class KNearestNeighbors implements Classifier diff --git a/tests/Phpml/Classifier/KNearestNeighborsTest.php b/tests/Phpml/Classifier/KNearestNeighborsTest.php index 0b3e0d2..5e8ae58 100644 --- a/tests/Phpml/Classifier/KNearestNeighborsTest.php +++ b/tests/Phpml/Classifier/KNearestNeighborsTest.php @@ -43,5 +43,18 @@ class KNearestNeighborsTest extends \PHPUnit_Framework_TestCase $this->assertEquals($testLabels, $predicted); } - + public function testPredictArrayOfSamplesUsingChebyshevDistanceMetric() + { + $trainSamples = [[1, 3], [1, 4], [2, 4], [3, 1], [4, 1], [4, 2]]; + $trainLabels = ['a', 'a', 'a', 'b', 'b', 'b']; + + $testSamples = [[3, 2], [5, 1], [4, 3], [4, -5], [2, 3], [1, 2], [1, 5], [3, 10]]; + $testLabels = ['b', 'b', 'b', 'b', 'a', 'a', 'a', 'a']; + + $classifier = new KNearestNeighbors(3, new Chebyshev()); + $classifier->train($trainSamples, $trainLabels); + $predicted = $classifier->predict($testSamples); + + $this->assertEquals($testLabels, $predicted); + } } From d82a12497ab69f720bcefbf0f0a1e2fe554df259 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Tue, 12 
Apr 2016 21:43:25 +0200 Subject: [PATCH 12/59] implement manhattan distance metric function --- src/Phpml/Metric/Distance/Manhattan.php | 35 ++++++++++ tests/Phpml/Metric/Distance/ManhattanTest.php | 64 +++++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 src/Phpml/Metric/Distance/Manhattan.php create mode 100644 tests/Phpml/Metric/Distance/ManhattanTest.php diff --git a/src/Phpml/Metric/Distance/Manhattan.php b/src/Phpml/Metric/Distance/Manhattan.php new file mode 100644 index 0000000..a284993 --- /dev/null +++ b/src/Phpml/Metric/Distance/Manhattan.php @@ -0,0 +1,35 @@ +distanceMetric = new Manhattan(); + } + + /** + * @expectedException \Phpml\Exception\InvalidArgumentException + */ + public function testThrowExceptionOnInvalidArguments() + { + $a = [0, 1, 2]; + $b = [0, 2]; + + $this->distanceMetric->distance($a, $b); + } + + public function testCalculateDistanceForOneDimension() + { + $a = [4]; + $b = [2]; + + $expectedDistance = 2; + $actualDistance = $this->distanceMetric->distance($a, $b); + + $this->assertEquals($expectedDistance, $actualDistance); + } + + public function testCalculateDistanceForTwoDimensions() + { + $a = [4, 6]; + $b = [2, 5]; + + $expectedDistance = 3; + $actualDistance = $this->distanceMetric->distance($a, $b); + + $this->assertEquals($expectedDistance, $actualDistance); + } + + public function testCalculateDistanceForThreeDimensions() + { + $a = [6, 10, 3]; + $b = [2, 5, 5]; + + $expectedDistance = 11; + $actualDistance = $this->distanceMetric->distance($a, $b); + + $this->assertEquals($expectedDistance, $actualDistance); + } +} From 79b76fb1a4e4938949c0be9913cbae6228664ef7 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Tue, 12 Apr 2016 22:02:14 +0200 Subject: [PATCH 13/59] implement minkowski distance metric function --- src/Phpml/Metric/Distance/Minkowski.php | 49 ++++++++++++ tests/Phpml/Metric/Distance/MinkowskiTest.php | 77 +++++++++++++++++++ 2 files changed, 126 insertions(+) create mode 100644 
src/Phpml/Metric/Distance/Minkowski.php create mode 100644 tests/Phpml/Metric/Distance/MinkowskiTest.php diff --git a/src/Phpml/Metric/Distance/Minkowski.php b/src/Phpml/Metric/Distance/Minkowski.php new file mode 100644 index 0000000..5e3c50f --- /dev/null +++ b/src/Phpml/Metric/Distance/Minkowski.php @@ -0,0 +1,49 @@ +lambda = $lambda; + } + + + /** + * @param array $a + * @param array $b + * + * @return float + * + * @throws InvalidArgumentException + */ + public function distance(array $a, array $b): float + { + if (count($a) !== count($b)) { + throw InvalidArgumentException::sizeNotMatch(); + } + + $distance = 0; + $count = count($a); + + for ($i = 0; $i < $count; ++$i) { + $distance += pow(abs($a[$i] - $b[$i]), $this->lambda); + } + + return pow($distance, 1 / $this->lambda); + } +} diff --git a/tests/Phpml/Metric/Distance/MinkowskiTest.php b/tests/Phpml/Metric/Distance/MinkowskiTest.php new file mode 100644 index 0000000..33ffc80 --- /dev/null +++ b/tests/Phpml/Metric/Distance/MinkowskiTest.php @@ -0,0 +1,77 @@ +distanceMetric = new Minkowski(); + } + + /** + * @expectedException \Phpml\Exception\InvalidArgumentException + */ + public function testThrowExceptionOnInvalidArguments() + { + $a = [0, 1, 2]; + $b = [0, 2]; + + $this->distanceMetric->distance($a, $b); + } + + public function testCalculateDistanceForOneDimension() + { + $a = [4]; + $b = [2]; + + $expectedDistance = 2; + $actualDistance = $this->distanceMetric->distance($a, $b); + + $this->assertEquals($expectedDistance, $actualDistance); + } + + public function testCalculateDistanceForTwoDimensions() + { + $a = [4, 6]; + $b = [2, 5]; + + $expectedDistance = 2.080; + $actualDistance = $this->distanceMetric->distance($a, $b); + + $this->assertEquals($expectedDistance, $actualDistance, '', $delta=0.001); + } + + public function testCalculateDistanceForThreeDimensions() + { + $a = [6, 10, 3]; + $b = [2, 5, 5]; + + $expectedDistance = 5.819; + $actualDistance = $this->distanceMetric->distance($a, $b); + 
+ $this->assertEquals($expectedDistance, $actualDistance, '', $delta=0.001); + } + + public function testCalculateDistanceForThreeDimensionsWithDifferentLambda() + { + $distanceMetric = new Minkowski($lambda = 5); + + $a = [6, 10, 3]; + $b = [2, 5, 5]; + + $expectedDistance = 5.300; + $actualDistance = $distanceMetric->distance($a, $b); + + $this->assertEquals($expectedDistance, $actualDistance, '', $delta=0.001); + } +} From 85243f2d9214b07795dafe917880cb7260e65282 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Tue, 12 Apr 2016 23:10:33 +0200 Subject: [PATCH 14/59] cs-fixer --- src/Phpml/Metric/Distance/Minkowski.php | 1 - tests/Phpml/Metric/Distance/MinkowskiTest.php | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/Phpml/Metric/Distance/Minkowski.php b/src/Phpml/Metric/Distance/Minkowski.php index 5e3c50f..b8ec0ef 100644 --- a/src/Phpml/Metric/Distance/Minkowski.php +++ b/src/Phpml/Metric/Distance/Minkowski.php @@ -22,7 +22,6 @@ class Minkowski implements Distance $this->lambda = $lambda; } - /** * @param array $a * @param array $b diff --git a/tests/Phpml/Metric/Distance/MinkowskiTest.php b/tests/Phpml/Metric/Distance/MinkowskiTest.php index 33ffc80..78d9ef3 100644 --- a/tests/Phpml/Metric/Distance/MinkowskiTest.php +++ b/tests/Phpml/Metric/Distance/MinkowskiTest.php @@ -48,7 +48,7 @@ class MinkowskiTest extends \PHPUnit_Framework_TestCase $expectedDistance = 2.080; $actualDistance = $this->distanceMetric->distance($a, $b); - $this->assertEquals($expectedDistance, $actualDistance, '', $delta=0.001); + $this->assertEquals($expectedDistance, $actualDistance, '', $delta = 0.001); } public function testCalculateDistanceForThreeDimensions() @@ -59,7 +59,7 @@ class MinkowskiTest extends \PHPUnit_Framework_TestCase $expectedDistance = 5.819; $actualDistance = $this->distanceMetric->distance($a, $b); - $this->assertEquals($expectedDistance, $actualDistance, '', $delta=0.001); + $this->assertEquals($expectedDistance, $actualDistance, 
'', $delta = 0.001); } public function testCalculateDistanceForThreeDimensionsWithDifferentLambda() @@ -72,6 +72,6 @@ class MinkowskiTest extends \PHPUnit_Framework_TestCase $expectedDistance = 5.300; $actualDistance = $distanceMetric->distance($a, $b); - $this->assertEquals($expectedDistance, $actualDistance, '', $delta=0.001); + $this->assertEquals($expectedDistance, $actualDistance, '', $delta = 0.001); } } From 50fbcddfc4c097e9d9c185b3ad4673fe1ccec538 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Wed, 13 Apr 2016 21:20:55 +0200 Subject: [PATCH 15/59] create docs for distance metrics functions --- docs/machine-learning/metric/distance/chebyshev.md | 3 +++ .../metric/{distance.md => distance/euclidean.md} | 11 ++++++----- docs/machine-learning/metric/distance/manhattan.md | 1 + docs/machine-learning/metric/distance/minkowski.md | 1 + mkdocs.yml | 6 +++++- 5 files changed, 16 insertions(+), 6 deletions(-) create mode 100644 docs/machine-learning/metric/distance/chebyshev.md rename docs/machine-learning/metric/{distance.md => distance/euclidean.md} (57%) create mode 100644 docs/machine-learning/metric/distance/manhattan.md create mode 100644 docs/machine-learning/metric/distance/minkowski.md diff --git a/docs/machine-learning/metric/distance/chebyshev.md b/docs/machine-learning/metric/distance/chebyshev.md new file mode 100644 index 0000000..e4374b6 --- /dev/null +++ b/docs/machine-learning/metric/distance/chebyshev.md @@ -0,0 +1,3 @@ +# Chebyshev + +Class for calculation Chebyshev distance. 
diff --git a/docs/machine-learning/metric/distance.md b/docs/machine-learning/metric/distance/euclidean.md similarity index 57% rename from docs/machine-learning/metric/distance.md rename to docs/machine-learning/metric/distance/euclidean.md index de8bcb1..91642af 100644 --- a/docs/machine-learning/metric/distance.md +++ b/docs/machine-learning/metric/distance/euclidean.md @@ -1,17 +1,18 @@ -# Distance +# Euclidean -Special class for calculation of different types of distance. +Class for calculation Euclidean distance. ### Euclidean ![euclidean](https://upload.wikimedia.org/math/8/4/9/849f040fd10bb86f7c85eb0bbe3566a4.png "Euclidean Distance") -To calculate euclidean distance: +To calculate distance: ``` $a = [4, 6]; $b = [2, 5]; - -Distance::euclidean($a, $b); + +$euclidean = new Euclidean(); +$euclidean->distance($a, $b); // return 2.2360679774998 ``` diff --git a/docs/machine-learning/metric/distance/manhattan.md b/docs/machine-learning/metric/distance/manhattan.md new file mode 100644 index 0000000..b29f739 --- /dev/null +++ b/docs/machine-learning/metric/distance/manhattan.md @@ -0,0 +1 @@ +# Manhattan diff --git a/docs/machine-learning/metric/distance/minkowski.md b/docs/machine-learning/metric/distance/minkowski.md new file mode 100644 index 0000000..aac44b8 --- /dev/null +++ b/docs/machine-learning/metric/distance/minkowski.md @@ -0,0 +1 @@ +# Minkowski diff --git a/mkdocs.yml b/mkdocs.yml index f20036f..92240e7 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -13,5 +13,9 @@ pages: - Iris: machine-learning/datasets/demo/iris.md - Metric: - Accuracy: machine-learning/metric/accuracy.md - - Distance: machine-learning/metric/distance.md + - Distance: + - Euclidean: machine-learning/metric/distance/euclidean.md + - Chebyshev: machine-learning/metric/distance/chebyshev.md + - Manhattan: machine-learning/metric/distance/manhattan.md + - Minkowski: machine-learning/metric/distance/minkowski.md theme: readthedocs \ No newline at end of file From 
100205d76778aa008da9b6ce0ebc1be56355800a Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Thu, 14 Apr 2016 22:56:54 +0200 Subject: [PATCH 16/59] simple Naive Bayes classifier --- src/Phpml/Classifier/NaiveBayes.php | 45 +++++++++++++++++++++++ tests/Phpml/Classifier/NaiveBayesTest.php | 38 +++++++++++++++++++ 2 files changed, 83 insertions(+) create mode 100644 tests/Phpml/Classifier/NaiveBayesTest.php diff --git a/src/Phpml/Classifier/NaiveBayes.php b/src/Phpml/Classifier/NaiveBayes.php index 7324d79..05cb120 100644 --- a/src/Phpml/Classifier/NaiveBayes.php +++ b/src/Phpml/Classifier/NaiveBayes.php @@ -6,12 +6,24 @@ namespace Phpml\Classifier; class NaiveBayes implements Classifier { + /** + * @var array + */ + private $samples; + + /** + * @var array + */ + private $labels; + /** * @param array $samples * @param array $labels */ public function train(array $samples, array $labels) { + $this->samples = $samples; + $this->labels = $labels; } /** @@ -21,5 +33,38 @@ class NaiveBayes implements Classifier */ public function predict(array $samples) { + if (!is_array($samples[0])) { + $predicted = $this->predictSample($samples); + } else { + $predicted = []; + foreach ($samples as $index => $sample) { + $predicted[$index] = $this->predictSample($sample); + } + } + + return $predicted; + } + + /** + * @param array $sample + * + * @return mixed + */ + private function predictSample(array $sample) + { + $predictions = []; + foreach ($this->labels as $index => $label) { + $predictions[$label] = 0; + foreach ($sample as $token => $count) { + if (array_key_exists($token, $this->samples[$index])) { + $predictions[$label] += $count * $this->samples[$index][$token]; + } + } + } + + arsort($predictions, SORT_NUMERIC); + reset($predictions); + + return key($predictions); } } diff --git a/tests/Phpml/Classifier/NaiveBayesTest.php b/tests/Phpml/Classifier/NaiveBayesTest.php new file mode 100644 index 0000000..ce52bbc --- /dev/null +++ b/tests/Phpml/Classifier/NaiveBayesTest.php 
@@ -0,0 +1,38 @@ +train($samples, $labels); + + $this->assertEquals('a', $classifier->predict([3, 1, 1])); + $this->assertEquals('b', $classifier->predict([1, 4, 1])); + $this->assertEquals('c', $classifier->predict([1, 1, 6])); + } + + public function testPredictArrayOfSamples() + { + $trainSamples = [[5, 1, 1], [1, 5, 1], [1, 1, 5]]; + $trainLabels = ['a', 'b', 'c']; + + $testSamples = [[3, 1, 1], [5, 1, 1], [4, 3, 8], [1, 1, 2], [2, 3, 2], [1, 2, 1], [9, 5, 1], [3, 1, 2]]; + $testLabels = ['a', 'a', 'c', 'c', 'b', 'b', 'a', 'a']; + + $classifier = new NaiveBayes(); + $classifier->train($trainSamples, $trainLabels); + $predicted = $classifier->predict($testSamples); + + $this->assertEquals($testLabels, $predicted); + } +} From 6f5f19060087fa47edc8f5d4fde50a222c153d3d Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Fri, 15 Apr 2016 22:32:20 +0200 Subject: [PATCH 17/59] docs for manhattan distance --- .../machine-learning/metric/distance/euclidean.md | 2 -- .../machine-learning/metric/distance/manhattan.md | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/docs/machine-learning/metric/distance/euclidean.md b/docs/machine-learning/metric/distance/euclidean.md index 91642af..decc18b 100644 --- a/docs/machine-learning/metric/distance/euclidean.md +++ b/docs/machine-learning/metric/distance/euclidean.md @@ -2,8 +2,6 @@ Class for calculation Euclidean distance. -### Euclidean - ![euclidean](https://upload.wikimedia.org/math/8/4/9/849f040fd10bb86f7c85eb0bbe3566a4.png "Euclidean Distance") To calculate distance: diff --git a/docs/machine-learning/metric/distance/manhattan.md b/docs/machine-learning/metric/distance/manhattan.md index b29f739..a1502cc 100644 --- a/docs/machine-learning/metric/distance/manhattan.md +++ b/docs/machine-learning/metric/distance/manhattan.md @@ -1 +1,16 @@ # Manhattan + +Class for calculation Manhattan distance. 
+ +![manhattan](https://upload.wikimedia.org/math/4/c/5/4c568bd1d76a6b15e19cb2ac3ad75350.png "Manhattan Distance") + +To calculate distance: + +``` +$a = [4, 6]; +$b = [2, 5]; + +$manhattan = new Manhattan(); +$manhattan->distance($a, $b); +// return 3 +``` From a4ab370a4853b1914a603794eb75b47c1b92968f Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sat, 16 Apr 2016 21:24:40 +0200 Subject: [PATCH 18/59] create traits for reduce complexity --- src/Phpml/Classifier/KNearestNeighbors.php | 46 ++++----------------- src/Phpml/Classifier/NaiveBayes.php | 43 +++---------------- src/Phpml/Classifier/Traits/Predictable.php | 27 ++++++++++++ src/Phpml/Classifier/Traits/Trainable.php | 29 +++++++++++++ tests/Phpml/Dataset/CsvDatasetTest.php | 12 +++++- 5 files changed, 79 insertions(+), 78 deletions(-) create mode 100644 src/Phpml/Classifier/Traits/Predictable.php create mode 100644 src/Phpml/Classifier/Traits/Trainable.php diff --git a/src/Phpml/Classifier/KNearestNeighbors.php b/src/Phpml/Classifier/KNearestNeighbors.php index 9f37d98..4747dc3 100644 --- a/src/Phpml/Classifier/KNearestNeighbors.php +++ b/src/Phpml/Classifier/KNearestNeighbors.php @@ -4,28 +4,25 @@ declare (strict_types = 1); namespace Phpml\Classifier; +use Phpml\Classifier\Traits\Predictable; +use Phpml\Classifier\Traits\Trainable; use Phpml\Metric\Distance; use Phpml\Metric\Distance\Euclidean; class KNearestNeighbors implements Classifier { + use Trainable, Predictable; + /** * @var int */ private $k; + /** + * @var Distance + */ private $distanceMetric; - /** - * @var array - */ - private $samples; - - /** - * @var array - */ - private $labels; - /** * @param int $k * @param Distance|null $distanceMetric (if null then Euclidean distance as default) @@ -42,35 +39,6 @@ class KNearestNeighbors implements Classifier $this->distanceMetric = $distanceMetric; } - /** - * @param array $samples - * @param array $labels - */ - public function train(array $samples, array $labels) - { - $this->samples = 
$samples; - $this->labels = $labels; - } - - /** - * @param array $samples - * - * @return mixed - */ - public function predict(array $samples) - { - if (!is_array($samples[0])) { - $predicted = $this->predictSample($samples); - } else { - $predicted = []; - foreach ($samples as $index => $sample) { - $predicted[$index] = $this->predictSample($sample); - } - } - - return $predicted; - } - /** * @param array $sample * diff --git a/src/Phpml/Classifier/NaiveBayes.php b/src/Phpml/Classifier/NaiveBayes.php index 05cb120..ba3881b 100644 --- a/src/Phpml/Classifier/NaiveBayes.php +++ b/src/Phpml/Classifier/NaiveBayes.php @@ -4,46 +4,12 @@ declare (strict_types = 1); namespace Phpml\Classifier; +use Phpml\Classifier\Traits\Predictable; +use Phpml\Classifier\Traits\Trainable; + class NaiveBayes implements Classifier { - /** - * @var array - */ - private $samples; - - /** - * @var array - */ - private $labels; - - /** - * @param array $samples - * @param array $labels - */ - public function train(array $samples, array $labels) - { - $this->samples = $samples; - $this->labels = $labels; - } - - /** - * @param array $samples - * - * @return mixed - */ - public function predict(array $samples) - { - if (!is_array($samples[0])) { - $predicted = $this->predictSample($samples); - } else { - $predicted = []; - foreach ($samples as $index => $sample) { - $predicted[$index] = $this->predictSample($sample); - } - } - - return $predicted; - } + use Trainable, Predictable; /** * @param array $sample @@ -67,4 +33,5 @@ class NaiveBayes implements Classifier return key($predictions); } + } diff --git a/src/Phpml/Classifier/Traits/Predictable.php b/src/Phpml/Classifier/Traits/Predictable.php new file mode 100644 index 0000000..7c22585 --- /dev/null +++ b/src/Phpml/Classifier/Traits/Predictable.php @@ -0,0 +1,27 @@ +predictSample($samples); + } else { + $predicted = []; + foreach ($samples as $index => $sample) { + $predicted[$index] = $this->predictSample($sample); + } + } + + return 
$predicted; + } + +} diff --git a/src/Phpml/Classifier/Traits/Trainable.php b/src/Phpml/Classifier/Traits/Trainable.php new file mode 100644 index 0000000..3e66af6 --- /dev/null +++ b/src/Phpml/Classifier/Traits/Trainable.php @@ -0,0 +1,29 @@ +samples = $samples; + $this->labels = $labels; + } + +} diff --git a/tests/Phpml/Dataset/CsvDatasetTest.php b/tests/Phpml/Dataset/CsvDatasetTest.php index db87d62..2994504 100644 --- a/tests/Phpml/Dataset/CsvDatasetTest.php +++ b/tests/Phpml/Dataset/CsvDatasetTest.php @@ -16,7 +16,7 @@ class CsvDatasetTest extends \PHPUnit_Framework_TestCase new CsvDataset('missingFile', 3); } - public function testSampleCsvDataset() + public function testSampleCsvDatasetWithHeaderRow() { $filePath = dirname(__FILE__).'/Resources/dataset.csv'; @@ -25,4 +25,14 @@ class CsvDatasetTest extends \PHPUnit_Framework_TestCase $this->assertEquals(10, count($dataset->getSamples())); $this->assertEquals(10, count($dataset->getLabels())); } + + public function testSampleCsvDatasetWithoutHeaderRow() + { + $filePath = dirname(__FILE__).'/Resources/dataset.csv'; + + $dataset = new CsvDataset($filePath, 2, false); + + $this->assertEquals(11, count($dataset->getSamples())); + $this->assertEquals(11, count($dataset->getLabels())); + } } From dea6972305c04eeb67e4b4078e5bb21b945414c7 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sat, 16 Apr 2016 21:26:58 +0200 Subject: [PATCH 19/59] change csv dataset constructor --- src/Phpml/Dataset/CsvDataset.php | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/Phpml/Dataset/CsvDataset.php b/src/Phpml/Dataset/CsvDataset.php index e6dafd2..dca2792 100644 --- a/src/Phpml/Dataset/CsvDataset.php +++ b/src/Phpml/Dataset/CsvDataset.php @@ -26,19 +26,19 @@ class CsvDataset extends ArrayDataset throw DatasetException::missingFile(basename($filepath)); } - $row = 0; - if (($handle = fopen($filepath, 'r')) !== false) { - while (($data = fgetcsv($handle, 1000, ',')) !== false) { - 
++$row; - if ($headingRow && $row == 1) { - continue; - } - $this->samples[] = array_slice($data, 0, $features); - $this->labels[] = $data[$features]; - } - fclose($handle); - } else { + if(false === $handle = fopen($filepath, 'r')) { throw DatasetException::cantOpenFile(basename($filepath)); } + $row = 0; + while (($data = fgetcsv($handle, 1000, ',')) !== false) { + ++$row; + if ($headingRow && $row == 1) { + continue; + } + $this->samples[] = array_slice($data, 0, $features); + $this->labels[] = $data[$features]; + } + fclose($handle); + } } From 5170c1077325f5c0b929f2934d9955de37bd716a Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sat, 16 Apr 2016 21:34:50 +0200 Subject: [PATCH 20/59] simplify csv dataset --- src/Phpml/Dataset/CsvDataset.php | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/Phpml/Dataset/CsvDataset.php b/src/Phpml/Dataset/CsvDataset.php index dca2792..f2c8ca7 100644 --- a/src/Phpml/Dataset/CsvDataset.php +++ b/src/Phpml/Dataset/CsvDataset.php @@ -29,16 +29,15 @@ class CsvDataset extends ArrayDataset if(false === $handle = fopen($filepath, 'r')) { throw DatasetException::cantOpenFile(basename($filepath)); } - $row = 0; + + if ($headingRow) { + fgets($handle); + } + while (($data = fgetcsv($handle, 1000, ',')) !== false) { - ++$row; - if ($headingRow && $row == 1) { - continue; - } $this->samples[] = array_slice($data, 0, $features); $this->labels[] = $data[$features]; } fclose($handle); - } } From d2e0ce446cfbb30db3d113575853346afe1e1943 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sat, 16 Apr 2016 21:41:37 +0200 Subject: [PATCH 21/59] update classifier docs --- .../classification/knearestneighbors.md | 2 ++ .../classification/naivebayes.md | 27 +++++++++++++++++++ mkdocs.yml | 1 + src/Phpml/Classifier/NaiveBayes.php | 1 - src/Phpml/Classifier/Traits/Predictable.php | 4 +-- src/Phpml/Classifier/Traits/Trainable.php | 5 ++-- src/Phpml/Dataset/CsvDataset.php | 2 +- 7 files changed, 35 insertions(+), 
7 deletions(-) create mode 100644 docs/machine-learning/classification/naivebayes.md diff --git a/docs/machine-learning/classification/knearestneighbors.md b/docs/machine-learning/classification/knearestneighbors.md index 569c48b..7d16828 100644 --- a/docs/machine-learning/classification/knearestneighbors.md +++ b/docs/machine-learning/classification/knearestneighbors.md @@ -5,9 +5,11 @@ Classifier implementing the k-nearest neighbors algorithm. ### Constructor Parameters * $k - number of nearest neighbors to scan (default: 3) +* $distanceMetric - Distance class, default Euclidean (see Distance Metric documentation) ``` $classifier = new KNearestNeighbors($k=4); +$classifier = new KNearestNeighbors($k=3, new Minkowski($lambda=4)); ``` ### Train diff --git a/docs/machine-learning/classification/naivebayes.md b/docs/machine-learning/classification/naivebayes.md new file mode 100644 index 0000000..c700106 --- /dev/null +++ b/docs/machine-learning/classification/naivebayes.md @@ -0,0 +1,27 @@ +# NaiveBayes Classifier + +Classifier based on applying Bayes' theorem with strong (naive) independence assumptions between the features. + +### Train + +To train a classifier simply provide train samples and labels (as `array`): + +``` +$samples = [[5, 1, 1], [1, 5, 1], [1, 1, 5]]; +$labels = ['a', 'b', 'c']; + +$classifier = new NaiveBayes(); +$classifier->train($samples, $labels); +``` + +### Predict + +To predict sample class use `predict` method. 
You can provide one sample or array of samples: + +``` +$classifier->predict([3, 1, 1]); +// return 'a' + +$classifier->predict([[3, 1, 1], [1, 4, 1]); +// return ['a', 'b'] +``` diff --git a/mkdocs.yml b/mkdocs.yml index 92240e7..55d1914 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -4,6 +4,7 @@ pages: - Machine Learning: - Classification: - KNearestNeighbors: machine-learning/classification/knearestneighbors.md + - NaiveBayes: machine-learning/classification/naivebayes.md - Cross Validation: - RandomSplit: machine-learning/cross-validation/randomsplit.md - Datasets: diff --git a/src/Phpml/Classifier/NaiveBayes.php b/src/Phpml/Classifier/NaiveBayes.php index ba3881b..4eeb865 100644 --- a/src/Phpml/Classifier/NaiveBayes.php +++ b/src/Phpml/Classifier/NaiveBayes.php @@ -33,5 +33,4 @@ class NaiveBayes implements Classifier return key($predictions); } - } diff --git a/src/Phpml/Classifier/Traits/Predictable.php b/src/Phpml/Classifier/Traits/Predictable.php index 7c22585..8090f8c 100644 --- a/src/Phpml/Classifier/Traits/Predictable.php +++ b/src/Phpml/Classifier/Traits/Predictable.php @@ -1,5 +1,6 @@ samples = $samples; $this->labels = $labels; } - } diff --git a/src/Phpml/Dataset/CsvDataset.php b/src/Phpml/Dataset/CsvDataset.php index f2c8ca7..7d1f91e 100644 --- a/src/Phpml/Dataset/CsvDataset.php +++ b/src/Phpml/Dataset/CsvDataset.php @@ -26,7 +26,7 @@ class CsvDataset extends ArrayDataset throw DatasetException::missingFile(basename($filepath)); } - if(false === $handle = fopen($filepath, 'r')) { + if (false === $handle = fopen($filepath, 'r')) { throw DatasetException::cantOpenFile(basename($filepath)); } From e5ca6c83f1e70ec73dc52dfc943e46b8a71ac697 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sun, 17 Apr 2016 17:10:26 +0200 Subject: [PATCH 22/59] some bug in code --- src/Phpml/Metric/Distance/Minkowski.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Phpml/Metric/Distance/Minkowski.php b/src/Phpml/Metric/Distance/Minkowski.php index 
b8ec0ef..89273d6 100644 --- a/src/Phpml/Metric/Distance/Minkowski.php +++ b/src/Phpml/Metric/Distance/Minkowski.php @@ -36,7 +36,7 @@ class Minkowski implements Distance throw InvalidArgumentException::sizeNotMatch(); } - $distance = 0; + $distance = 1; $count = count($a); for ($i = 0; $i < $count; ++$i) { From ded28def2d1c4531c8ff81d07fd83973511b9332 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sun, 17 Apr 2016 17:47:30 +0200 Subject: [PATCH 23/59] fix minkowski distance --- src/Phpml/Metric/Distance/Minkowski.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Phpml/Metric/Distance/Minkowski.php b/src/Phpml/Metric/Distance/Minkowski.php index 89273d6..b8ec0ef 100644 --- a/src/Phpml/Metric/Distance/Minkowski.php +++ b/src/Phpml/Metric/Distance/Minkowski.php @@ -36,7 +36,7 @@ class Minkowski implements Distance throw InvalidArgumentException::sizeNotMatch(); } - $distance = 1; + $distance = 0; $count = count($a); for ($i = 0; $i < $count; ++$i) { From b04cf220bdbba4d23e828f5297b11110e3b0be38 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Mon, 18 Apr 2016 22:58:43 +0200 Subject: [PATCH 24/59] rename exception named constructor name --- src/Phpml/Dataset/ArrayDataset.php | 2 +- src/Phpml/Exception/InvalidArgumentException.php | 4 ++-- src/Phpml/Metric/Accuracy.php | 2 +- src/Phpml/Metric/Distance/Chebyshev.php | 2 +- src/Phpml/Metric/Distance/Euclidean.php | 2 +- src/Phpml/Metric/Distance/Manhattan.php | 2 +- src/Phpml/Metric/Distance/Minkowski.php | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/Phpml/Dataset/ArrayDataset.php b/src/Phpml/Dataset/ArrayDataset.php index d117122..7c5c2b5 100644 --- a/src/Phpml/Dataset/ArrayDataset.php +++ b/src/Phpml/Dataset/ArrayDataset.php @@ -27,7 +27,7 @@ class ArrayDataset implements Dataset public function __construct(array $samples, array $labels) { if (count($samples) != count($labels)) { - throw InvalidArgumentException::sizeNotMatch(); + throw 
InvalidArgumentException::arraySizeNotMatch(); } $this->samples = $samples; diff --git a/src/Phpml/Exception/InvalidArgumentException.php b/src/Phpml/Exception/InvalidArgumentException.php index 48006fa..428f637 100644 --- a/src/Phpml/Exception/InvalidArgumentException.php +++ b/src/Phpml/Exception/InvalidArgumentException.php @@ -9,9 +9,9 @@ class InvalidArgumentException extends \Exception /** * @return InvalidArgumentException */ - public static function sizeNotMatch() + public static function arraySizeNotMatch() { - return new self('Size of given arguments not match'); + return new self('Size of given arrays not match'); } /** diff --git a/src/Phpml/Metric/Accuracy.php b/src/Phpml/Metric/Accuracy.php index 3ad806d..0565ead 100644 --- a/src/Phpml/Metric/Accuracy.php +++ b/src/Phpml/Metric/Accuracy.php @@ -20,7 +20,7 @@ class Accuracy public static function score(array $actualLabels, array $predictedLabels, bool $normalize = true) { if (count($actualLabels) != count($predictedLabels)) { - throw InvalidArgumentException::sizeNotMatch(); + throw InvalidArgumentException::arraySizeNotMatch(); } $score = 0; diff --git a/src/Phpml/Metric/Distance/Chebyshev.php b/src/Phpml/Metric/Distance/Chebyshev.php index 8146e4d..0dab5e8 100644 --- a/src/Phpml/Metric/Distance/Chebyshev.php +++ b/src/Phpml/Metric/Distance/Chebyshev.php @@ -20,7 +20,7 @@ class Chebyshev implements Distance public function distance(array $a, array $b): float { if (count($a) !== count($b)) { - throw InvalidArgumentException::sizeNotMatch(); + throw InvalidArgumentException::arraySizeNotMatch(); } $differences = []; diff --git a/src/Phpml/Metric/Distance/Euclidean.php b/src/Phpml/Metric/Distance/Euclidean.php index 09d17c2..da3d2e3 100644 --- a/src/Phpml/Metric/Distance/Euclidean.php +++ b/src/Phpml/Metric/Distance/Euclidean.php @@ -20,7 +20,7 @@ class Euclidean implements Distance public function distance(array $a, array $b): float { if (count($a) !== count($b)) { - throw 
InvalidArgumentException::sizeNotMatch(); + throw InvalidArgumentException::arraySizeNotMatch(); } $distance = 0; diff --git a/src/Phpml/Metric/Distance/Manhattan.php b/src/Phpml/Metric/Distance/Manhattan.php index a284993..f69dc42 100644 --- a/src/Phpml/Metric/Distance/Manhattan.php +++ b/src/Phpml/Metric/Distance/Manhattan.php @@ -20,7 +20,7 @@ class Manhattan implements Distance public function distance(array $a, array $b): float { if (count($a) !== count($b)) { - throw InvalidArgumentException::sizeNotMatch(); + throw InvalidArgumentException::arraySizeNotMatch(); } $distance = 0; diff --git a/src/Phpml/Metric/Distance/Minkowski.php b/src/Phpml/Metric/Distance/Minkowski.php index b8ec0ef..c014252 100644 --- a/src/Phpml/Metric/Distance/Minkowski.php +++ b/src/Phpml/Metric/Distance/Minkowski.php @@ -33,7 +33,7 @@ class Minkowski implements Distance public function distance(array $a, array $b): float { if (count($a) !== count($b)) { - throw InvalidArgumentException::sizeNotMatch(); + throw InvalidArgumentException::arraySizeNotMatch(); } $distance = 0; From d9d7895947caa88037012b46ee3a928ee2a9ed8b Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Tue, 19 Apr 2016 22:54:15 +0200 Subject: [PATCH 25/59] force implementation of protected functon in Predictable trait --- src/Phpml/Classifier/KNearestNeighbors.php | 2 +- src/Phpml/Classifier/NaiveBayes.php | 2 +- src/Phpml/Classifier/Traits/Predictable.php | 7 +++++++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/Phpml/Classifier/KNearestNeighbors.php b/src/Phpml/Classifier/KNearestNeighbors.php index 4747dc3..ad5aebe 100644 --- a/src/Phpml/Classifier/KNearestNeighbors.php +++ b/src/Phpml/Classifier/KNearestNeighbors.php @@ -44,7 +44,7 @@ class KNearestNeighbors implements Classifier * * @return mixed */ - private function predictSample(array $sample) + protected function predictSample(array $sample) { $distances = $this->kNeighborsDistances($sample); diff --git 
a/src/Phpml/Classifier/NaiveBayes.php b/src/Phpml/Classifier/NaiveBayes.php index 4eeb865..cf8dcaa 100644 --- a/src/Phpml/Classifier/NaiveBayes.php +++ b/src/Phpml/Classifier/NaiveBayes.php @@ -16,7 +16,7 @@ class NaiveBayes implements Classifier * * @return mixed */ - private function predictSample(array $sample) + protected function predictSample(array $sample) { $predictions = []; foreach ($this->labels as $index => $label) { diff --git a/src/Phpml/Classifier/Traits/Predictable.php b/src/Phpml/Classifier/Traits/Predictable.php index 8090f8c..6f5df6a 100644 --- a/src/Phpml/Classifier/Traits/Predictable.php +++ b/src/Phpml/Classifier/Traits/Predictable.php @@ -24,4 +24,11 @@ trait Predictable return $predicted; } + + /** + * @param array $sample + * + * @return mixed + */ + abstract protected function predictSample(array $sample); } From 6024b1f6642930430a6b14637294c7df930621eb Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Tue, 19 Apr 2016 22:54:33 +0200 Subject: [PATCH 26/59] create SVM class skeleton --- src/Phpml/Classifier/SupportVectorMachine.php | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 src/Phpml/Classifier/SupportVectorMachine.php diff --git a/src/Phpml/Classifier/SupportVectorMachine.php b/src/Phpml/Classifier/SupportVectorMachine.php new file mode 100644 index 0000000..139a062 --- /dev/null +++ b/src/Phpml/Classifier/SupportVectorMachine.php @@ -0,0 +1,22 @@ + Date: Wed, 20 Apr 2016 23:56:33 +0200 Subject: [PATCH 27/59] extract Math namespace --- src/Phpml/Classifier/KNearestNeighbors.php | 4 +-- src/Phpml/Classifier/SupportVectorMachine.php | 35 +++++++++++++++++++ src/Phpml/{Metric => Math}/Distance.php | 2 +- .../{Metric => Math}/Distance/Chebyshev.php | 4 +-- .../{Metric => Math}/Distance/Euclidean.php | 4 +-- .../{Metric => Math}/Distance/Manhattan.php | 4 +-- .../{Metric => Math}/Distance/Minkowski.php | 4 +-- .../Classifier/KNearestNeighborsTest.php | 2 +- .../Distance/ChebyshevTest.php | 2 +- 
.../Distance/EuclideanTest.php | 2 +- .../Distance/ManhattanTest.php | 2 +- .../Distance/MinkowskiTest.php | 2 +- 12 files changed, 51 insertions(+), 16 deletions(-) rename src/Phpml/{Metric => Math}/Distance.php (89%) rename src/Phpml/{Metric => Math}/Distance/Chebyshev.php (91%) rename src/Phpml/{Metric => Math}/Distance/Euclidean.php (91%) rename src/Phpml/{Metric => Math}/Distance/Manhattan.php (91%) rename src/Phpml/{Metric => Math}/Distance/Minkowski.php (93%) rename tests/Phpml/{Metric => Math}/Distance/ChebyshevTest.php (97%) rename tests/Phpml/{Metric => Math}/Distance/EuclideanTest.php (97%) rename tests/Phpml/{Metric => Math}/Distance/ManhattanTest.php (97%) rename tests/Phpml/{Metric => Math}/Distance/MinkowskiTest.php (97%) diff --git a/src/Phpml/Classifier/KNearestNeighbors.php b/src/Phpml/Classifier/KNearestNeighbors.php index ad5aebe..46733f5 100644 --- a/src/Phpml/Classifier/KNearestNeighbors.php +++ b/src/Phpml/Classifier/KNearestNeighbors.php @@ -6,8 +6,8 @@ namespace Phpml\Classifier; use Phpml\Classifier\Traits\Predictable; use Phpml\Classifier\Traits\Trainable; -use Phpml\Metric\Distance; -use Phpml\Metric\Distance\Euclidean; +use Phpml\Math\Distance; +use Phpml\Math\Distance\Euclidean; class KNearestNeighbors implements Classifier { diff --git a/src/Phpml/Classifier/SupportVectorMachine.php b/src/Phpml/Classifier/SupportVectorMachine.php index 139a062..25d266c 100644 --- a/src/Phpml/Classifier/SupportVectorMachine.php +++ b/src/Phpml/Classifier/SupportVectorMachine.php @@ -11,6 +11,41 @@ class SupportVectorMachine implements Classifier { use Trainable, Predictable; + /** + * @var float + */ + private $gamma; + + /** + * @var float + */ + private $epsilon; + + /** + * @var float + */ + private $tolerance; + + /** + * @var int + */ + private $upperBound; + + /** + * @param float $gamma + * @param float $epsilon + * @param float $tolerance + * @param int $upperBound + */ + public function __construct(float $gamma = .5, float $epsilon = .001, 
float $tolerance = .001, int $upperBound = 100) + { + $this->gamma = $gamma; + $this->epsilon = $epsilon; + $this->tolerance = $tolerance; + $this->upperBound = $upperBound; + } + + /** * @param array $sample * diff --git a/src/Phpml/Metric/Distance.php b/src/Phpml/Math/Distance.php similarity index 89% rename from src/Phpml/Metric/Distance.php rename to src/Phpml/Math/Distance.php index b590f61..ed929b3 100644 --- a/src/Phpml/Metric/Distance.php +++ b/src/Phpml/Math/Distance.php @@ -2,7 +2,7 @@ declare (strict_types = 1); -namespace Phpml\Metric; +namespace Phpml\Math; interface Distance { diff --git a/src/Phpml/Metric/Distance/Chebyshev.php b/src/Phpml/Math/Distance/Chebyshev.php similarity index 91% rename from src/Phpml/Metric/Distance/Chebyshev.php rename to src/Phpml/Math/Distance/Chebyshev.php index 0dab5e8..386d98a 100644 --- a/src/Phpml/Metric/Distance/Chebyshev.php +++ b/src/Phpml/Math/Distance/Chebyshev.php @@ -2,10 +2,10 @@ declare (strict_types = 1); -namespace Phpml\Metric\Distance; +namespace Phpml\Math\Distance; use Phpml\Exception\InvalidArgumentException; -use Phpml\Metric\Distance; +use Phpml\Math\Distance; class Chebyshev implements Distance { diff --git a/src/Phpml/Metric/Distance/Euclidean.php b/src/Phpml/Math/Distance/Euclidean.php similarity index 91% rename from src/Phpml/Metric/Distance/Euclidean.php rename to src/Phpml/Math/Distance/Euclidean.php index da3d2e3..cca8f2b 100644 --- a/src/Phpml/Metric/Distance/Euclidean.php +++ b/src/Phpml/Math/Distance/Euclidean.php @@ -2,10 +2,10 @@ declare (strict_types = 1); -namespace Phpml\Metric\Distance; +namespace Phpml\Math\Distance; use Phpml\Exception\InvalidArgumentException; -use Phpml\Metric\Distance; +use Phpml\Math\Distance; class Euclidean implements Distance { diff --git a/src/Phpml/Metric/Distance/Manhattan.php b/src/Phpml/Math/Distance/Manhattan.php similarity index 91% rename from src/Phpml/Metric/Distance/Manhattan.php rename to src/Phpml/Math/Distance/Manhattan.php index 
f69dc42..8253eb7 100644 --- a/src/Phpml/Metric/Distance/Manhattan.php +++ b/src/Phpml/Math/Distance/Manhattan.php @@ -2,10 +2,10 @@ declare (strict_types = 1); -namespace Phpml\Metric\Distance; +namespace Phpml\Math\Distance; use Phpml\Exception\InvalidArgumentException; -use Phpml\Metric\Distance; +use Phpml\Math\Distance; class Manhattan implements Distance { diff --git a/src/Phpml/Metric/Distance/Minkowski.php b/src/Phpml/Math/Distance/Minkowski.php similarity index 93% rename from src/Phpml/Metric/Distance/Minkowski.php rename to src/Phpml/Math/Distance/Minkowski.php index c014252..5c08c03 100644 --- a/src/Phpml/Metric/Distance/Minkowski.php +++ b/src/Phpml/Math/Distance/Minkowski.php @@ -2,10 +2,10 @@ declare (strict_types = 1); -namespace Phpml\Metric\Distance; +namespace Phpml\Math\Distance; use Phpml\Exception\InvalidArgumentException; -use Phpml\Metric\Distance; +use Phpml\Math\Distance; class Minkowski implements Distance { diff --git a/tests/Phpml/Classifier/KNearestNeighborsTest.php b/tests/Phpml/Classifier/KNearestNeighborsTest.php index 5e8ae58..fd7ebad 100644 --- a/tests/Phpml/Classifier/KNearestNeighborsTest.php +++ b/tests/Phpml/Classifier/KNearestNeighborsTest.php @@ -5,7 +5,7 @@ declare (strict_types = 1); namespace tests\Classifier; use Phpml\Classifier\KNearestNeighbors; -use Phpml\Metric\Distance\Chebyshev; +use Phpml\Math\Distance\Chebyshev; class KNearestNeighborsTest extends \PHPUnit_Framework_TestCase { diff --git a/tests/Phpml/Metric/Distance/ChebyshevTest.php b/tests/Phpml/Math/Distance/ChebyshevTest.php similarity index 97% rename from tests/Phpml/Metric/Distance/ChebyshevTest.php rename to tests/Phpml/Math/Distance/ChebyshevTest.php index 306260c..78fb2a0 100644 --- a/tests/Phpml/Metric/Distance/ChebyshevTest.php +++ b/tests/Phpml/Math/Distance/ChebyshevTest.php @@ -4,7 +4,7 @@ declare (strict_types = 1); namespace tests\Phpml\Metric; -use Phpml\Metric\Distance\Chebyshev; +use Phpml\Math\Distance\Chebyshev; class ChebyshevTest extends 
\PHPUnit_Framework_TestCase { diff --git a/tests/Phpml/Metric/Distance/EuclideanTest.php b/tests/Phpml/Math/Distance/EuclideanTest.php similarity index 97% rename from tests/Phpml/Metric/Distance/EuclideanTest.php rename to tests/Phpml/Math/Distance/EuclideanTest.php index d9bf449..a3dea3c 100644 --- a/tests/Phpml/Metric/Distance/EuclideanTest.php +++ b/tests/Phpml/Math/Distance/EuclideanTest.php @@ -4,7 +4,7 @@ declare (strict_types = 1); namespace tests\Phpml\Metric; -use Phpml\Metric\Distance\Euclidean; +use Phpml\Math\Distance\Euclidean; class EuclideanTest extends \PHPUnit_Framework_TestCase { diff --git a/tests/Phpml/Metric/Distance/ManhattanTest.php b/tests/Phpml/Math/Distance/ManhattanTest.php similarity index 97% rename from tests/Phpml/Metric/Distance/ManhattanTest.php rename to tests/Phpml/Math/Distance/ManhattanTest.php index 94ce5ca..7d0cf2d 100644 --- a/tests/Phpml/Metric/Distance/ManhattanTest.php +++ b/tests/Phpml/Math/Distance/ManhattanTest.php @@ -4,7 +4,7 @@ declare (strict_types = 1); namespace tests\Phpml\Metric; -use Phpml\Metric\Distance\Manhattan; +use Phpml\Math\Distance\Manhattan; class ManhattanTest extends \PHPUnit_Framework_TestCase { diff --git a/tests/Phpml/Metric/Distance/MinkowskiTest.php b/tests/Phpml/Math/Distance/MinkowskiTest.php similarity index 97% rename from tests/Phpml/Metric/Distance/MinkowskiTest.php rename to tests/Phpml/Math/Distance/MinkowskiTest.php index 78d9ef3..ad9318d 100644 --- a/tests/Phpml/Metric/Distance/MinkowskiTest.php +++ b/tests/Phpml/Math/Distance/MinkowskiTest.php @@ -4,7 +4,7 @@ declare (strict_types = 1); namespace tests\Phpml\Metric; -use Phpml\Metric\Distance\Minkowski; +use Phpml\Math\Distance\Minkowski; class MinkowskiTest extends \PHPUnit_Framework_TestCase { From 34281e40eec2c71cf6a242c5b1f4c708179ee4a4 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Thu, 21 Apr 2016 00:23:03 +0200 Subject: [PATCH 28/59] add scalar product function --- src/Phpml/Math/Product.php | 25 
+++++++++++++++++++++++++ tests/Phpml/Math/ProductTest.php | 20 ++++++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 src/Phpml/Math/Product.php create mode 100644 tests/Phpml/Math/ProductTest.php diff --git a/src/Phpml/Math/Product.php b/src/Phpml/Math/Product.php new file mode 100644 index 0000000..992eeaa --- /dev/null +++ b/src/Phpml/Math/Product.php @@ -0,0 +1,25 @@ + $value) { + $product += $value * $b[$index]; + } + + return $product; + } + +} diff --git a/tests/Phpml/Math/ProductTest.php b/tests/Phpml/Math/ProductTest.php new file mode 100644 index 0000000..d7aa015 --- /dev/null +++ b/tests/Phpml/Math/ProductTest.php @@ -0,0 +1,20 @@ +assertEquals(10, $product->scalar([2, 3], [-1, 4])); + $this->assertEquals(-0.1, $product->scalar([1, 4, 1], [-2, 0.5, -0.1])); + $this->assertEquals(8, $product->scalar([2], [4])); + } + +} \ No newline at end of file From b30f4cbf11778d22a78d50754b80d68649bae5fb Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Thu, 21 Apr 2016 22:12:45 +0200 Subject: [PATCH 29/59] make scalar function static --- src/Phpml/Math/Product.php | 2 +- tests/Phpml/Math/ProductTest.php | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/Phpml/Math/Product.php b/src/Phpml/Math/Product.php index 992eeaa..3772207 100644 --- a/src/Phpml/Math/Product.php +++ b/src/Phpml/Math/Product.php @@ -12,7 +12,7 @@ class Product * * @return mixed */ - public function scalar(array $a, array $b) + public static function scalar(array $a, array $b) { $product = 0; foreach ($a as $index => $value) { diff --git a/tests/Phpml/Math/ProductTest.php b/tests/Phpml/Math/ProductTest.php index d7aa015..ade9942 100644 --- a/tests/Phpml/Math/ProductTest.php +++ b/tests/Phpml/Math/ProductTest.php @@ -10,11 +10,9 @@ class ProductTest extends \PHPUnit_Framework_TestCase public function testScalarProduct() { - $product = new Product(); - - $this->assertEquals(10, $product->scalar([2, 3], [-1, 4])); - $this->assertEquals(-0.1, 
$product->scalar([1, 4, 1], [-2, 0.5, -0.1])); - $this->assertEquals(8, $product->scalar([2], [4])); + $this->assertEquals(10, Product::scalar([2, 3], [-1, 4])); + $this->assertEquals(-0.1, Product::scalar([1, 4, 1], [-2, 0.5, -0.1])); + $this->assertEquals(8, Product::scalar([2], [4])); } } \ No newline at end of file From 37782eba98ea62f245b5b6a6fd4674191b884263 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Thu, 21 Apr 2016 22:54:38 +0200 Subject: [PATCH 30/59] implement RBF kernel function --- src/Phpml/Math/Kernel.php | 17 +++++++++++++ src/Phpml/Math/Kernel/RBF.php | 39 +++++++++++++++++++++++++++++ tests/Phpml/Math/Kernel/RBFTest.php | 26 +++++++++++++++++++ 3 files changed, 82 insertions(+) create mode 100644 src/Phpml/Math/Kernel.php create mode 100644 src/Phpml/Math/Kernel/RBF.php create mode 100644 tests/Phpml/Math/Kernel/RBFTest.php diff --git a/src/Phpml/Math/Kernel.php b/src/Phpml/Math/Kernel.php new file mode 100644 index 0000000..776f78c --- /dev/null +++ b/src/Phpml/Math/Kernel.php @@ -0,0 +1,17 @@ +gamma = $gamma; + } + + /** + * @param float $a + * @param float $b + * + * @return float + */ + public function compute($a, $b) + { + $score = 2 * Product::scalar($a, $b); + $squares = Product::scalar($a, $a) + Product::scalar($b, $b); + $result = exp(-$this->gamma * ($squares - $score)); + + return $result; + } + +} diff --git a/tests/Phpml/Math/Kernel/RBFTest.php b/tests/Phpml/Math/Kernel/RBFTest.php new file mode 100644 index 0000000..cc22229 --- /dev/null +++ b/tests/Phpml/Math/Kernel/RBFTest.php @@ -0,0 +1,26 @@ +assertEquals(1, $rbf->compute([1, 2], [1, 2])); + $this->assertEquals(0.97336, $rbf->compute([1, 2, 3], [4, 5, 6]), '', $delta = 0.0001); + $this->assertEquals(0.00011, $rbf->compute([4, 5], [1, 100]), '', $delta = 0.0001); + + $rbf = new RBF($gamma = 0.2); + + $this->assertEquals(1, $rbf->compute([1, 2], [1, 2])); + $this->assertEquals(0.00451, $rbf->compute([1, 2, 3], [4, 5, 6]), '', $delta = 0.0001); + $this->assertEquals(0, 
$rbf->compute([4, 5], [1, 100])); + } + +} From 118ee0b5fed0f7d004b5da48182f6636e2678113 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Thu, 21 Apr 2016 23:21:08 +0200 Subject: [PATCH 31/59] svm is hard :( --- src/Phpml/Classifier/SupportVectorMachine.php | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/Phpml/Classifier/SupportVectorMachine.php b/src/Phpml/Classifier/SupportVectorMachine.php index 25d266c..e16ebbe 100644 --- a/src/Phpml/Classifier/SupportVectorMachine.php +++ b/src/Phpml/Classifier/SupportVectorMachine.php @@ -6,20 +6,21 @@ namespace Phpml\Classifier; use Phpml\Classifier\Traits\Predictable; use Phpml\Classifier\Traits\Trainable; +use Phpml\Math\Kernel; class SupportVectorMachine implements Classifier { use Trainable, Predictable; /** - * @var float + * @var Kernel */ - private $gamma; + private $kernel; /** * @var float */ - private $epsilon; + private $C; /** * @var float @@ -32,20 +33,23 @@ class SupportVectorMachine implements Classifier private $upperBound; /** - * @param float $gamma - * @param float $epsilon + * @param Kernel $kernel + * @param float $C * @param float $tolerance * @param int $upperBound */ - public function __construct(float $gamma = .5, float $epsilon = .001, float $tolerance = .001, int $upperBound = 100) + public function __construct(Kernel $kernel = null, float $C = 1.0, float $tolerance = .001, int $upperBound = 100) { - $this->gamma = $gamma; - $this->epsilon = $epsilon; + if (null === $kernel) { + $kernel = new Kernel\RBF($gamma = .001); + } + + $this->kernel = $kernel; + $this->C = $C; $this->tolerance = $tolerance; $this->upperBound = $upperBound; } - /** * @param array $sample * From 46da769ca68c89555b5f8727f761979a31198028 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Mon, 25 Apr 2016 20:16:53 +0200 Subject: [PATCH 32/59] typo in variable name --- tests/Phpml/Dataset/Demo/GlassTest.php | 8 ++++---- tests/Phpml/Dataset/Demo/WineTest.php | 8 ++++---- 2 files 
changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/Phpml/Dataset/Demo/GlassTest.php b/tests/Phpml/Dataset/Demo/GlassTest.php index 62cef3a..6f6e177 100644 --- a/tests/Phpml/Dataset/Demo/GlassTest.php +++ b/tests/Phpml/Dataset/Demo/GlassTest.php @@ -10,13 +10,13 @@ class GlassTest extends \PHPUnit_Framework_TestCase { public function testLoadingWineDataset() { - $iris = new Glass(); + $glass = new Glass(); // whole dataset - $this->assertEquals(214, count($iris->getSamples())); - $this->assertEquals(214, count($iris->getLabels())); + $this->assertEquals(214, count($glass->getSamples())); + $this->assertEquals(214, count($glass->getLabels())); // one sample features count - $this->assertEquals(9, count($iris->getSamples()[0])); + $this->assertEquals(9, count($glass->getSamples()[0])); } } diff --git a/tests/Phpml/Dataset/Demo/WineTest.php b/tests/Phpml/Dataset/Demo/WineTest.php index 5aa3a3b..de16483 100644 --- a/tests/Phpml/Dataset/Demo/WineTest.php +++ b/tests/Phpml/Dataset/Demo/WineTest.php @@ -10,13 +10,13 @@ class WineTest extends \PHPUnit_Framework_TestCase { public function testLoadingWineDataset() { - $iris = new Wine(); + $wine = new Wine(); // whole dataset - $this->assertEquals(178, count($iris->getSamples())); - $this->assertEquals(178, count($iris->getLabels())); + $this->assertEquals(178, count($wine->getSamples())); + $this->assertEquals(178, count($wine->getLabels())); // one sample features count - $this->assertEquals(13, count($iris->getSamples()[0])); + $this->assertEquals(13, count($wine->getSamples()[0])); } } From af3b57692fbda8292478842d4e860d0afa4e336b Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Mon, 25 Apr 2016 22:55:34 +0200 Subject: [PATCH 33/59] linear regression is also hard --- src/Phpml/Classifier/SupportVectorMachine.php | 6 +-- src/Phpml/Math/Kernel.php | 5 +-- src/Phpml/Math/Kernel/RBF.php | 4 +- src/Phpml/Math/Product.php | 5 +-- src/Phpml/Regression/LeastSquares.php | 37 +++++++++++++++++++ 
src/Phpml/Regression/Regression.php | 21 +++++++++++ tests/Phpml/Math/Kernel/RBFTest.php | 5 +-- tests/Phpml/Math/ProductTest.php | 7 ++-- 8 files changed, 72 insertions(+), 18 deletions(-) create mode 100644 src/Phpml/Regression/LeastSquares.php create mode 100644 src/Phpml/Regression/Regression.php diff --git a/src/Phpml/Classifier/SupportVectorMachine.php b/src/Phpml/Classifier/SupportVectorMachine.php index e16ebbe..8ee3731 100644 --- a/src/Phpml/Classifier/SupportVectorMachine.php +++ b/src/Phpml/Classifier/SupportVectorMachine.php @@ -34,9 +34,9 @@ class SupportVectorMachine implements Classifier /** * @param Kernel $kernel - * @param float $C - * @param float $tolerance - * @param int $upperBound + * @param float $C + * @param float $tolerance + * @param int $upperBound */ public function __construct(Kernel $kernel = null, float $C = 1.0, float $tolerance = .001, int $upperBound = 100) { diff --git a/src/Phpml/Math/Kernel.php b/src/Phpml/Math/Kernel.php index 776f78c..953a5fa 100644 --- a/src/Phpml/Math/Kernel.php +++ b/src/Phpml/Math/Kernel.php @@ -1,11 +1,11 @@ features = $features; + $this->targets = $targets; + } + + /** + * @param array $features + * + * @return mixed + */ + public function predict(array $features) + { + } +} diff --git a/src/Phpml/Regression/Regression.php b/src/Phpml/Regression/Regression.php new file mode 100644 index 0000000..34e0b6d --- /dev/null +++ b/src/Phpml/Regression/Regression.php @@ -0,0 +1,21 @@ +assertEquals(0.00451, $rbf->compute([1, 2, 3], [4, 5, 6]), '', $delta = 0.0001); $this->assertEquals(0, $rbf->compute([4, 5], [1, 100])); } - } diff --git a/tests/Phpml/Math/ProductTest.php b/tests/Phpml/Math/ProductTest.php index ade9942..aba0ff2 100644 --- a/tests/Phpml/Math/ProductTest.php +++ b/tests/Phpml/Math/ProductTest.php @@ -1,5 +1,6 @@ assertEquals(10, Product::scalar([2, 3], [-1, 4])); $this->assertEquals(-0.1, Product::scalar([1, 4, 1], [-2, 0.5, -0.1])); $this->assertEquals(8, Product::scalar([2], [4])); } - -} \ No 
newline at end of file +} From 66dcfcf2b775a0edf01444224a591c9ba75d4bdc Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Wed, 27 Apr 2016 23:04:59 +0200 Subject: [PATCH 34/59] implement standard deviation of population function --- .../Exception/InvalidArgumentException.php | 18 ++++++++ .../Math/Statistic/StandardDeviation.php | 45 +++++++++++++++++++ src/Phpml/Regression/LeastSquares.php | 10 +++++ .../Math/Statistic/StandardDeviationTest.php | 43 ++++++++++++++++++ 4 files changed, 116 insertions(+) create mode 100644 src/Phpml/Math/Statistic/StandardDeviation.php create mode 100644 tests/Phpml/Math/Statistic/StandardDeviationTest.php diff --git a/src/Phpml/Exception/InvalidArgumentException.php b/src/Phpml/Exception/InvalidArgumentException.php index 428f637..6b10a1a 100644 --- a/src/Phpml/Exception/InvalidArgumentException.php +++ b/src/Phpml/Exception/InvalidArgumentException.php @@ -23,4 +23,22 @@ class InvalidArgumentException extends \Exception { return new self(sprintf('%s must be between 0.0 and 1.0', $name)); } + + /** + * @return InvalidArgumentException + */ + public static function arrayCantBeEmpty() + { + return new self('The array has zero elements'); + } + + /** + * @param int $minimumSize + * + * @return InvalidArgumentException + */ + public static function arraySizeToSmall($minimumSize = 2) + { + return new self(sprintf('The array must have at least %s elements', $minimumSize)); + } } diff --git a/src/Phpml/Math/Statistic/StandardDeviation.php b/src/Phpml/Math/Statistic/StandardDeviation.php new file mode 100644 index 0000000..3c57a4a --- /dev/null +++ b/src/Phpml/Math/Statistic/StandardDeviation.php @@ -0,0 +1,45 @@ +assertEquals(1.825, StandardDeviation::population($population), '', $delta); + + //http://www.stat.wmich.edu/s216/book/node126.html + $delta = 0.5; + $population = [7100, 15500, 4400, 4400, 5900, 4600, 8800, 2000, 2750, 2550, 960, 1025]; + $this->assertEquals(4079, StandardDeviation::population($population), '', $delta); + + 
$population = [9300, 10565, 15000, 15000, 17764, 57000, 65940, 73676, 77006, 93739, 146088, 153260]; + $this->assertEquals(50989, StandardDeviation::population($population), '', $delta); + } + + /** + * @expectedException \Phpml\Exception\InvalidArgumentException + */ + public function testThrowExceptionOnEmptyArrayIfNotSample() + { + StandardDeviation::population([], false); + } + + /** + * @expectedException \Phpml\Exception\InvalidArgumentException + */ + public function testThrowExceptionOnToSmallArray() + { + StandardDeviation::population([1]); + } + +} From cbec77d2478ca5b6c0452cac28829a58f516e775 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Wed, 27 Apr 2016 23:28:01 +0200 Subject: [PATCH 35/59] pearson correlation function --- src/Phpml/Math/Statistic/Correlation.php | 45 +++++++++++++++++++ .../Math/Statistic/StandardDeviation.php | 15 +++---- .../Phpml/Math/Statistic/CorrelationTest.php | 33 ++++++++++++++ .../Math/Statistic/StandardDeviationTest.php | 1 - 4 files changed, 85 insertions(+), 9 deletions(-) create mode 100644 src/Phpml/Math/Statistic/Correlation.php create mode 100644 tests/Phpml/Math/Statistic/CorrelationTest.php diff --git a/src/Phpml/Math/Statistic/Correlation.php b/src/Phpml/Math/Statistic/Correlation.php new file mode 100644 index 0000000..a6047a8 --- /dev/null +++ b/src/Phpml/Math/Statistic/Correlation.php @@ -0,0 +1,45 @@ +assertEquals(-0.641, Correlation::pearson($x, $y), '', $delta); + + //http://www.statisticshowto.com/how-to-compute-pearsons-correlation-coefficients/ + $delta = 0.001; + $x = [43, 21, 25, 42, 57, 59]; + $y = [99, 65, 79, 75, 87, 82]; + $this->assertEquals(0.549, Correlation::pearson($x, $y), '', $delta); + } + + /** + * @expectedException \Phpml\Exception\InvalidArgumentException + */ + public function testThrowExceptionOnInvalidArgumentsForPearsonCorrelation() + { + Correlation::pearson([1, 2, 4], [3, 5]); + } +} diff --git a/tests/Phpml/Math/Statistic/StandardDeviationTest.php 
b/tests/Phpml/Math/Statistic/StandardDeviationTest.php index ff69f7c..299c979 100644 --- a/tests/Phpml/Math/Statistic/StandardDeviationTest.php +++ b/tests/Phpml/Math/Statistic/StandardDeviationTest.php @@ -39,5 +39,4 @@ class StandardDeviationTest extends \PHPUnit_Framework_TestCase { StandardDeviation::population([1]); } - } From 80a712e8a86cdfdc8bfb8582bf2ef0d894b98b8e Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Wed, 27 Apr 2016 23:51:14 +0200 Subject: [PATCH 36/59] implement Least Squares Regression --- src/Phpml/Math/Statistic/Mean.php | 18 ++++++++++ src/Phpml/Regression/LeastSquares.php | 29 ++++++++++++++-- src/Phpml/Regression/Regression.php | 4 +-- tests/Phpml/Regression/LeastSquaresTest.php | 37 +++++++++++++++++++++ 4 files changed, 84 insertions(+), 4 deletions(-) create mode 100644 src/Phpml/Math/Statistic/Mean.php create mode 100644 tests/Phpml/Regression/LeastSquaresTest.php diff --git a/src/Phpml/Math/Statistic/Mean.php b/src/Phpml/Math/Statistic/Mean.php new file mode 100644 index 0000000..2716b78 --- /dev/null +++ b/src/Phpml/Math/Statistic/Mean.php @@ -0,0 +1,18 @@ +features = $features; $this->targets = $targets; + + $this->computeSlope(); + $this->computeIntercept(); } /** - * @param array $features + * @param float $feature * * @return mixed */ - public function predict(array $features) + public function predict($feature) { + return $this->intercept + ($this->slope * $feature); + } + + private function computeSlope() + { + $correlation = Correlation::pearson($this->features, $this->targets); + $sdX = StandardDeviation::population($this->features); + $sdY = StandardDeviation::population($this->targets); + + $this->slope = $correlation * ($sdY / $sdX); + } + + private function computeIntercept() + { + $meanY = Mean::arithmetic($this->targets); + $meanX = Mean::arithmetic($this->features); + + $this->intercept = $meanY - ($this->slope * $meanX); } } diff --git a/src/Phpml/Regression/Regression.php b/src/Phpml/Regression/Regression.php 
index 34e0b6d..f1f5c8a 100644 --- a/src/Phpml/Regression/Regression.php +++ b/src/Phpml/Regression/Regression.php @@ -13,9 +13,9 @@ interface Regression public function train(array $features, array $targets); /** - * @param array $features + * @param float $feature * * @return mixed */ - public function predict(array $features); + public function predict($feature); } diff --git a/tests/Phpml/Regression/LeastSquaresTest.php b/tests/Phpml/Regression/LeastSquaresTest.php new file mode 100644 index 0000000..eed7537 --- /dev/null +++ b/tests/Phpml/Regression/LeastSquaresTest.php @@ -0,0 +1,37 @@ +train($features, $targets); + + $this->assertEquals(4.06, $regression->predict(64), '', $delta); + + //http://www.stat.wmich.edu/s216/book/node127.html + $features = [9300, 10565, 15000, 15000, 17764, 57000, 65940, 73676, 77006, 93739, 146088, 153260]; + $targets = [7100, 15500, 4400, 4400, 5900, 4600, 8800, 2000, 2750, 2550, 960, 1025]; + + $regression = new LeastSquares(); + $regression->train($features, $targets); + + $this->assertEquals(7659.35, $regression->predict(9300), '', $delta); + $this->assertEquals(5213.81, $regression->predict(57000), '', $delta); + $this->assertEquals(4188.13, $regression->predict(77006), '', $delta); + $this->assertEquals(7659.35, $regression->predict(9300), '', $delta); + $this->assertEquals(278.66, $regression->predict(153260), '', $delta); + } +} From 52cbda0482a3d6cd790a21408b3fc42620d7a5e9 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Wed, 27 Apr 2016 23:57:05 +0200 Subject: [PATCH 37/59] replace mean calls --- src/Phpml/Math/Statistic/Correlation.php | 4 ++-- src/Phpml/Math/Statistic/StandardDeviation.php | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Phpml/Math/Statistic/Correlation.php b/src/Phpml/Math/Statistic/Correlation.php index a6047a8..1d2e8ac 100644 --- a/src/Phpml/Math/Statistic/Correlation.php +++ b/src/Phpml/Math/Statistic/Correlation.php @@ -23,8 +23,8 @@ class Correlation } $count = 
count($x); - $meanX = array_sum($x) / $count; - $meanY = array_sum($y) / $count; + $meanX = Mean::arithmetic($x); + $meanY = Mean::arithmetic($y); $axb = 0; $a2 = 0; diff --git a/src/Phpml/Math/Statistic/StandardDeviation.php b/src/Phpml/Math/Statistic/StandardDeviation.php index a05faa4..2b03c54 100644 --- a/src/Phpml/Math/Statistic/StandardDeviation.php +++ b/src/Phpml/Math/Statistic/StandardDeviation.php @@ -28,7 +28,7 @@ class StandardDeviation throw InvalidArgumentException::arraySizeToSmall(2); } - $mean = array_sum($a) / $n; + $mean = Mean::arithmetic($a); $carry = 0.0; foreach ($a as $val) { $d = $val - $mean; From b5e4cbe66e8b4e50353f2fe5a98c97ad2d2098ce Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Wed, 27 Apr 2016 23:57:23 +0200 Subject: [PATCH 38/59] add Mean::arithmetic tests --- tests/Phpml/Math/Statistic/MeanTest.php | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 tests/Phpml/Math/Statistic/MeanTest.php diff --git a/tests/Phpml/Math/Statistic/MeanTest.php b/tests/Phpml/Math/Statistic/MeanTest.php new file mode 100644 index 0000000..2e57ade --- /dev/null +++ b/tests/Phpml/Math/Statistic/MeanTest.php @@ -0,0 +1,17 @@ +assertEquals(3.5, Mean::arithmetic([2, 5]), '', $delta); + $this->assertEquals(41.16, Mean::arithmetic([43, 21, 25, 42, 57, 59]), '', $delta); + } +} From 3e4dc3ddf8c1db9e3f89bcdcde052c4a0309c7f0 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Thu, 28 Apr 2016 07:32:48 +0200 Subject: [PATCH 39/59] add test for mean with floats --- tests/Phpml/Math/Statistic/MeanTest.php | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/Phpml/Math/Statistic/MeanTest.php b/tests/Phpml/Math/Statistic/MeanTest.php index 2e57ade..f0dca3b 100644 --- a/tests/Phpml/Math/Statistic/MeanTest.php +++ b/tests/Phpml/Math/Statistic/MeanTest.php @@ -13,5 +13,6 @@ class MeanTest extends \PHPUnit_Framework_TestCase $delta = 0.01; $this->assertEquals(3.5, Mean::arithmetic([2, 5]), '', $delta); $this->assertEquals(41.16, 
Mean::arithmetic([43, 21, 25, 42, 57, 59]), '', $delta); + $this->assertEquals(1.7, Mean::arithmetic([0.5, 0.5, 1.5, 2.5, 3.5]), '', $delta); } } From 9d74174a68f7a272963e2c680d440f4cd3dce9d4 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Fri, 29 Apr 2016 23:03:08 +0200 Subject: [PATCH 40/59] ls reg with error :( --- src/Phpml/Regression/LeastSquares.php | 73 ++++++++++++++----- src/Phpml/Regression/Regression.php | 8 +- .../Phpml/Math/Statistic/CorrelationTest.php | 5 ++ tests/Phpml/Regression/LeastSquaresTest.php | 37 +++++++--- 4 files changed, 91 insertions(+), 32 deletions(-) diff --git a/src/Phpml/Regression/LeastSquares.php b/src/Phpml/Regression/LeastSquares.php index 793ae95..622c187 100644 --- a/src/Phpml/Regression/LeastSquares.php +++ b/src/Phpml/Regression/LeastSquares.php @@ -10,6 +10,11 @@ use Phpml\Math\Statistic\Mean; class LeastSquares implements Regression { + /** + * @var array + */ + private $samples; + /** * @var array */ @@ -21,52 +26,86 @@ class LeastSquares implements Regression private $targets; /** - * @var float + * @var array */ - private $slope; + private $slopes; /** - * @var + * @var float */ private $intercept; /** - * @param array $features + * @param array $samples * @param array $targets */ - public function train(array $features, array $targets) + public function train(array $samples, array $targets) { - $this->features = $features; + $this->samples = $samples; $this->targets = $targets; + $this->features = []; - $this->computeSlope(); + $this->computeSlopes(); $this->computeIntercept(); } /** - * @param float $feature + * @param float $sample * * @return mixed */ - public function predict($feature) + public function predict($sample) { - return $this->intercept + ($this->slope * $feature); + $result = $this->intercept; + foreach ($this->slopes as $index => $slope) { + $result += ($slope * $sample[$index]); + } + + return $result; } - private function computeSlope() + /** + * @return array + */ + public function 
getSlopes() { - $correlation = Correlation::pearson($this->features, $this->targets); - $sdX = StandardDeviation::population($this->features); + return $this->slopes; + } + + private function computeSlopes() + { + $features = count($this->samples[0]); $sdY = StandardDeviation::population($this->targets); - $this->slope = $correlation * ($sdY / $sdX); + for($i=0; $i<$features; $i++) { + $correlation = Correlation::pearson($this->getFeatures($i), $this->targets); + $sdXi = StandardDeviation::population($this->getFeatures($i)); + $this->slopes[] = $correlation * ($sdY / $sdXi); + } } private function computeIntercept() { - $meanY = Mean::arithmetic($this->targets); - $meanX = Mean::arithmetic($this->features); + $this->intercept = Mean::arithmetic($this->targets); + foreach ($this->slopes as $index => $slope) { + $this->intercept -= $slope * Mean::arithmetic($this->getFeatures($index)); + } + } - $this->intercept = $meanY - ($this->slope * $meanX); + /** + * @param $index + * + * @return array + */ + private function getFeatures($index) + { + if(!isset($this->features[$index])) { + $this->features[$index] = []; + foreach ($this->samples as $sample) { + $this->features[$index][] = $sample[$index]; + } + } + + return $this->features[$index]; } } diff --git a/src/Phpml/Regression/Regression.php b/src/Phpml/Regression/Regression.php index f1f5c8a..a7837d4 100644 --- a/src/Phpml/Regression/Regression.php +++ b/src/Phpml/Regression/Regression.php @@ -7,15 +7,15 @@ namespace Phpml\Regression; interface Regression { /** - * @param array $features + * @param array $samples * @param array $targets */ - public function train(array $features, array $targets); + public function train(array $samples, array $targets); /** - * @param float $feature + * @param float $sample * * @return mixed */ - public function predict($feature); + public function predict($sample); } diff --git a/tests/Phpml/Math/Statistic/CorrelationTest.php b/tests/Phpml/Math/Statistic/CorrelationTest.php index 
492d38c..948dc16 100644 --- a/tests/Phpml/Math/Statistic/CorrelationTest.php +++ b/tests/Phpml/Math/Statistic/CorrelationTest.php @@ -21,6 +21,11 @@ class CorrelationTest extends \PHPUnit_Framework_TestCase $x = [43, 21, 25, 42, 57, 59]; $y = [99, 65, 79, 75, 87, 82]; $this->assertEquals(0.549, Correlation::pearson($x, $y), '', $delta); + + $delta = 0.001; + $x = [60, 61, 62, 63, 65]; + $y = [3.1, 3.6, 3.8, 4, 4.1]; + $this->assertEquals(0.911, Correlation::pearson($x, $y), '', $delta); } /** diff --git a/tests/Phpml/Regression/LeastSquaresTest.php b/tests/Phpml/Regression/LeastSquaresTest.php index eed7537..d5975d8 100644 --- a/tests/Phpml/Regression/LeastSquaresTest.php +++ b/tests/Phpml/Regression/LeastSquaresTest.php @@ -8,30 +8,45 @@ use Phpml\Regression\LeastSquares; class LeastSquaresTest extends \PHPUnit_Framework_TestCase { - public function testPredictSingleFeature() + public function testPredictSingleFeatureSamples() { $delta = 0.01; //https://www.easycalculation.com/analytical/learn-least-square-regression.php - $features = [60, 61, 62, 63, 65]; + $samples = [[60], [61], [62], [63], [65]]; $targets = [3.1, 3.6, 3.8, 4, 4.1]; $regression = new LeastSquares(); - $regression->train($features, $targets); + $regression->train($samples, $targets); - $this->assertEquals(4.06, $regression->predict(64), '', $delta); + $this->assertEquals(4.06, $regression->predict([64]), '', $delta); //http://www.stat.wmich.edu/s216/book/node127.html - $features = [9300, 10565, 15000, 15000, 17764, 57000, 65940, 73676, 77006, 93739, 146088, 153260]; + $samples = [[9300], [10565], [15000], [15000], [17764], [57000], [65940], [73676], [77006], [93739], [146088], [153260]]; $targets = [7100, 15500, 4400, 4400, 5900, 4600, 8800, 2000, 2750, 2550, 960, 1025]; $regression = new LeastSquares(); - $regression->train($features, $targets); + $regression->train($samples, $targets); - $this->assertEquals(7659.35, $regression->predict(9300), '', $delta); - $this->assertEquals(5213.81, 
$regression->predict(57000), '', $delta); - $this->assertEquals(4188.13, $regression->predict(77006), '', $delta); - $this->assertEquals(7659.35, $regression->predict(9300), '', $delta); - $this->assertEquals(278.66, $regression->predict(153260), '', $delta); + $this->assertEquals(7659.35, $regression->predict([9300]), '', $delta); + $this->assertEquals(5213.81, $regression->predict([57000]), '', $delta); + $this->assertEquals(4188.13, $regression->predict([77006]), '', $delta); + $this->assertEquals(7659.35, $regression->predict([9300]), '', $delta); + $this->assertEquals(278.66, $regression->predict([153260]), '', $delta); } + + public function testPredictMultiFeaturesSamples() + { + $delta = 0.01; + + //http://www.stat.wmich.edu/s216/book/node129.html + $samples = [[73676, 1996],[77006,1998],[ 10565, 2000],[146088, 1995],[ 15000, 2001],[ 65940, 2000],[ 9300, 2000],[ 93739, 1996],[153260, 1994],[ 17764, 2002],[ 57000, 1998],[ 15000, 2000]]; + $targets = [2000, 2750, 15500, 960, 4400, 8800, 7100, 2550, 1025, 5900, 4600, 4400]; + + $regression = new LeastSquares(); + $regression->train($samples, $targets); + + $this->assertEquals(3807, $regression->predict([60000, 1996]), '', $delta); + } + } From 60c796f5d9e04b0fd22ebb4c41382e8965ed9153 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sat, 30 Apr 2016 00:58:54 +0200 Subject: [PATCH 41/59] create matrix calculation for ls regression for multiple variable --- .../Exception/InvalidArgumentException.php | 17 ++ src/Phpml/Exception/MatrixException.php | 25 ++ src/Phpml/Math/Matrix.php | 253 ++++++++++++++++++ src/Phpml/Regression/LeastSquares.php | 73 ++--- tests/Phpml/Regression/LeastSquaresTest.php | 16 +- 5 files changed, 324 insertions(+), 60 deletions(-) create mode 100644 src/Phpml/Exception/MatrixException.php create mode 100644 src/Phpml/Math/Matrix.php diff --git a/src/Phpml/Exception/InvalidArgumentException.php b/src/Phpml/Exception/InvalidArgumentException.php index 6b10a1a..c852ecf 100644 --- 
a/src/Phpml/Exception/InvalidArgumentException.php +++ b/src/Phpml/Exception/InvalidArgumentException.php @@ -41,4 +41,21 @@ class InvalidArgumentException extends \Exception { return new self(sprintf('The array must have at least %s elements', $minimumSize)); } + + /** + * @return InvalidArgumentException + */ + public static function matrixDimensionsDidNotMatch() + { + return new self('Matrix dimensions did not match'); + } + + /** + * @return InvalidArgumentException + */ + public static function inconsistentMatrixSupplied() + { + return new self('Inconsistent matrix aupplied'); + } + } diff --git a/src/Phpml/Exception/MatrixException.php b/src/Phpml/Exception/MatrixException.php new file mode 100644 index 0000000..8186f0a --- /dev/null +++ b/src/Phpml/Exception/MatrixException.php @@ -0,0 +1,25 @@ +rows = count($matrix); + $this->columns = count($matrix[0]); + + if($validate) { + for ($i = 0; $i < $this->rows; $i++) { + if (count($matrix[$i]) !== $this->columns) { + throw InvalidArgumentException::matrixDimensionsDidNotMatch(); + } + } + } + + $this->matrix = $matrix; + } + + /** + * @return array + */ + public function toArray() + { + return $this->matrix; + } + + /** + * @return int + */ + public function getRows() + { + return $this->rows; + } + + /** + * @return int + */ + public function getColumns() + { + return $this->columns; + } + + /** + * @param $column + * + * @return array + * + * @throws MatrixException + */ + public function getColumnValues($column) + { + if($column >= $this->columns) { + throw MatrixException::columnOutOfRange(); + } + + $values = []; + for ($i = 0; $i < $this->rows; $i++) { + $values[] = $this->matrix[$i][$column]; + } + + return $values; + } + + /** + * @return float|int + * + * @throws MatrixException + */ + public function getDeterminant() + { + if($this->determinant) { + return $this->determinant; + } + + if (!$this->isSquare()) { + throw MatrixException::notSquareMatrix(); + } + + $determinant = 0; + if ($this->rows == 1 
&& $this->columns == 1) { + $determinant = $this->matrix[0][0]; + } else if ($this->rows == 2 && $this->columns == 2) { + $determinant = $this->matrix[0][0] * $this->matrix[1][1] - + $this->matrix[0][1] * $this->matrix[1][0]; + } else { + for ($j = 0; $j < $this->columns; $j++) { + $subMatrix = $this->crossOut(0, $j); + if (fmod($j, 2) == 0) { + $determinant += $this->matrix[0][$j] * $subMatrix->getDeterminant(); + } else { + $determinant -= $this->matrix[0][$j] * $subMatrix->getDeterminant(); + } + } + } + + return $this->determinant = $determinant; + } + + /** + * @return bool + */ + public function isSquare() + { + return $this->columns === $this->rows; + } + + /** + * @return Matrix + */ + public function transpose() + { + $newMatrix = []; + for ($i = 0; $i < $this->rows; $i++) { + for ($j = 0; $j < $this->columns; $j++) { + $newMatrix[$j][$i] = $this->matrix[$i][$j]; + } + } + + return new self($newMatrix, false); + } + + /** + * @param Matrix $matrix + * + * @return Matrix + * + * @throws InvalidArgumentException + */ + public function multiply(Matrix $matrix) + { + if ($this->columns != $matrix->getRows()) { + throw InvalidArgumentException::inconsistentMatrixSupplied(); + } + + $product = []; + $multiplier = $matrix->toArray(); + for ($i = 0; $i < $this->rows; $i++) { + for ($j = 0; $j < $matrix->getColumns(); $j++) { + $product[$i][$j] = 0; + for ($k = 0; $k < $this->columns; $k++) { + $product[$i][$j] += $this->matrix[$i][$k] * $multiplier[$k][$j]; + } + } + } + return new self($product, false); + } + + /** + * @param $value + * + * @return Matrix + */ + public function divideByScalar($value) + { + $newMatrix = array(); + for ($i = 0; $i < $this->rows; $i++) { + for ($j = 0; $j < $this->columns; $j++) { + $newMatrix[$i][$j] = $this->matrix[$i][$j] / $value; + } + } + + return new self($newMatrix, false); + } + + /** + * @return Matrix + * + * @throws MatrixException + */ + public function inverse() + { + if (!$this->isSquare()) { + throw 
MatrixException::notSquareMatrix(); + } + + $newMatrix = array(); + for ($i = 0; $i < $this->rows; $i++) { + for ($j = 0; $j < $this->columns; $j++) { + $subMatrix = $this->crossOut($i, $j); + if (fmod($i + $j, 2) == 0) { + $newMatrix[$i][$j] = ($subMatrix->getDeterminant()); + } else { + $newMatrix[$i][$j] = -($subMatrix->getDeterminant()); + } + } + } + + $cofactorMatrix = new self($newMatrix, false); + + return $cofactorMatrix->transpose()->divideByScalar($this->getDeterminant()); + } + + /** + * @param int $row + * @param int $column + * + * @return Matrix + */ + public function crossOut(int $row, int $column) + { + $newMatrix = []; + $r = 0; + for ($i = 0; $i < $this->rows; $i++) { + $c = 0; + if ($row != $i) { + for ($j = 0; $j < $this->columns; $j++) { + if ($column != $j) { + $newMatrix[$r][$c] = $this->matrix[$i][$j]; + $c++; + } + } + $r++; + } + } + + return new self($newMatrix, false); + } + +} diff --git a/src/Phpml/Regression/LeastSquares.php b/src/Phpml/Regression/LeastSquares.php index 622c187..34dc745 100644 --- a/src/Phpml/Regression/LeastSquares.php +++ b/src/Phpml/Regression/LeastSquares.php @@ -4,9 +4,7 @@ declare (strict_types = 1); namespace Phpml\Regression; -use Phpml\Math\Statistic\Correlation; -use Phpml\Math\Statistic\StandardDeviation; -use Phpml\Math\Statistic\Mean; +use Phpml\Math\Matrix; class LeastSquares implements Regression { @@ -15,26 +13,21 @@ class LeastSquares implements Regression */ private $samples; - /** - * @var array - */ - private $features; - /** * @var array */ private $targets; - /** - * @var array - */ - private $slopes; - /** * @var float */ private $intercept; + /** + * @var array + */ + private $coefficients; + /** * @param array $samples * @param array $targets @@ -43,22 +36,20 @@ class LeastSquares implements Regression { $this->samples = $samples; $this->targets = $targets; - $this->features = []; - $this->computeSlopes(); - $this->computeIntercept(); + $this->computeCoefficients(); } /** - * @param float 
$sample + * @param array $sample * * @return mixed */ public function predict($sample) { $result = $this->intercept; - foreach ($this->slopes as $index => $slope) { - $result += ($slope * $sample[$index]); + foreach ($this->coefficients as $index => $coefficient) { + $result += $coefficient * $sample[$index]; } return $result; @@ -67,45 +58,23 @@ class LeastSquares implements Regression /** * @return array */ - public function getSlopes() + public function getCoefficients() { - return $this->slopes; - } - - private function computeSlopes() - { - $features = count($this->samples[0]); - $sdY = StandardDeviation::population($this->targets); - - for($i=0; $i<$features; $i++) { - $correlation = Correlation::pearson($this->getFeatures($i), $this->targets); - $sdXi = StandardDeviation::population($this->getFeatures($i)); - $this->slopes[] = $correlation * ($sdY / $sdXi); - } - } - - private function computeIntercept() - { - $this->intercept = Mean::arithmetic($this->targets); - foreach ($this->slopes as $index => $slope) { - $this->intercept -= $slope * Mean::arithmetic($this->getFeatures($index)); - } + return $this->coefficients; } /** - * @param $index - * - * @return array + * coefficient(b) = (X'X)-1X'Y */ - private function getFeatures($index) + private function computeCoefficients() { - if(!isset($this->features[$index])) { - $this->features[$index] = []; - foreach ($this->samples as $sample) { - $this->features[$index][] = $sample[$index]; - } - } + $samplesMatrix = new Matrix($this->samples); + $targetsMatrix = new Matrix($this->targets); - return $this->features[$index]; + $ts = $samplesMatrix->transpose()->multiply($samplesMatrix)->inverse(); + $tf = $samplesMatrix->transpose()->multiply($targetsMatrix); + + $this->coefficients = $ts->multiply($tf)->getColumnValues(0); + $this->intercept = array_shift($this->coefficients); } } diff --git a/tests/Phpml/Regression/LeastSquaresTest.php b/tests/Phpml/Regression/LeastSquaresTest.php index d5975d8..8859ba9 100644 --- 
a/tests/Phpml/Regression/LeastSquaresTest.php +++ b/tests/Phpml/Regression/LeastSquaresTest.php @@ -13,8 +13,8 @@ class LeastSquaresTest extends \PHPUnit_Framework_TestCase $delta = 0.01; //https://www.easycalculation.com/analytical/learn-least-square-regression.php - $samples = [[60], [61], [62], [63], [65]]; - $targets = [3.1, 3.6, 3.8, 4, 4.1]; + $samples = [[1, 60], [1, 61], [1, 62], [1, 63], [1, 65]]; + $targets = [[3.1], [3.6], [3.8], [4], [4.1]]; $regression = new LeastSquares(); $regression->train($samples, $targets); @@ -22,8 +22,8 @@ class LeastSquaresTest extends \PHPUnit_Framework_TestCase $this->assertEquals(4.06, $regression->predict([64]), '', $delta); //http://www.stat.wmich.edu/s216/book/node127.html - $samples = [[9300], [10565], [15000], [15000], [17764], [57000], [65940], [73676], [77006], [93739], [146088], [153260]]; - $targets = [7100, 15500, 4400, 4400, 5900, 4600, 8800, 2000, 2750, 2550, 960, 1025]; + $samples = [[1 ,9300], [1, 10565], [1, 15000], [1, 15000], [1, 17764], [1, 57000], [1, 65940], [1, 73676], [1, 77006], [1, 93739], [1, 146088], [1, 153260]]; + $targets = [[7100], [15500], [4400], [4400], [5900], [4600], [8800], [2000], [2750], [2550], [960], [1025]]; $regression = new LeastSquares(); $regression->train($samples, $targets); @@ -37,16 +37,16 @@ class LeastSquaresTest extends \PHPUnit_Framework_TestCase public function testPredictMultiFeaturesSamples() { - $delta = 0.01; + $delta = 1; //http://www.stat.wmich.edu/s216/book/node129.html - $samples = [[73676, 1996],[77006,1998],[ 10565, 2000],[146088, 1995],[ 15000, 2001],[ 65940, 2000],[ 9300, 2000],[ 93739, 1996],[153260, 1994],[ 17764, 2002],[ 57000, 1998],[ 15000, 2000]]; - $targets = [2000, 2750, 15500, 960, 4400, 8800, 7100, 2550, 1025, 5900, 4600, 4400]; + $samples = [[1, 73676, 1996],[1, 77006, 1998],[1, 10565, 2000],[1, 146088, 1995],[1, 15000, 2001],[1, 65940, 2000],[1, 9300, 2000],[1, 93739, 1996],[1, 153260, 1994],[1, 17764, 2002],[1, 57000, 1998],[1, 15000, 2000]]; + 
$targets = [[2000], [ 2750], [15500], [ 960], [ 4400], [ 8800], [ 7100], [ 2550], [ 1025], [ 5900], [ 4600], [ 4400]]; $regression = new LeastSquares(); $regression->train($samples, $targets); - $this->assertEquals(3807, $regression->predict([60000, 1996]), '', $delta); + $this->assertEquals(4094, $regression->predict([60000, 1996]), '', $delta); } } From 633974fea06d469e55d3456341c035cb18605bbf Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sat, 30 Apr 2016 00:59:10 +0200 Subject: [PATCH 42/59] php-cs-fxier --- .../Exception/InvalidArgumentException.php | 1 - src/Phpml/Exception/MatrixException.php | 1 - src/Phpml/Math/Matrix.php | 49 ++++++++++--------- src/Phpml/Regression/LeastSquares.php | 2 +- tests/Phpml/Regression/LeastSquaresTest.php | 7 ++- 5 files changed, 29 insertions(+), 31 deletions(-) diff --git a/src/Phpml/Exception/InvalidArgumentException.php b/src/Phpml/Exception/InvalidArgumentException.php index c852ecf..9e88250 100644 --- a/src/Phpml/Exception/InvalidArgumentException.php +++ b/src/Phpml/Exception/InvalidArgumentException.php @@ -57,5 +57,4 @@ class InvalidArgumentException extends \Exception { return new self('Inconsistent matrix aupplied'); } - } diff --git a/src/Phpml/Exception/MatrixException.php b/src/Phpml/Exception/MatrixException.php index 8186f0a..440a0ac 100644 --- a/src/Phpml/Exception/MatrixException.php +++ b/src/Phpml/Exception/MatrixException.php @@ -21,5 +21,4 @@ class MatrixException extends \Exception { return new self('Column out of range'); } - } diff --git a/src/Phpml/Math/Matrix.php b/src/Phpml/Math/Matrix.php index 20d6561..a071fd9 100644 --- a/src/Phpml/Math/Matrix.php +++ b/src/Phpml/Math/Matrix.php @@ -1,5 +1,6 @@ rows = count($matrix); $this->columns = count($matrix[0]); - if($validate) { - for ($i = 0; $i < $this->rows; $i++) { + if ($validate) { + for ($i = 0; $i < $this->rows; ++$i) { if (count($matrix[$i]) !== $this->columns) { throw InvalidArgumentException::matrixDimensionsDidNotMatch(); } @@ -83,12 
+84,12 @@ class Matrix */ public function getColumnValues($column) { - if($column >= $this->columns) { + if ($column >= $this->columns) { throw MatrixException::columnOutOfRange(); } $values = []; - for ($i = 0; $i < $this->rows; $i++) { + for ($i = 0; $i < $this->rows; ++$i) { $values[] = $this->matrix[$i][$column]; } @@ -102,7 +103,7 @@ class Matrix */ public function getDeterminant() { - if($this->determinant) { + if ($this->determinant) { return $this->determinant; } @@ -113,11 +114,11 @@ class Matrix $determinant = 0; if ($this->rows == 1 && $this->columns == 1) { $determinant = $this->matrix[0][0]; - } else if ($this->rows == 2 && $this->columns == 2) { + } elseif ($this->rows == 2 && $this->columns == 2) { $determinant = $this->matrix[0][0] * $this->matrix[1][1] - $this->matrix[0][1] * $this->matrix[1][0]; } else { - for ($j = 0; $j < $this->columns; $j++) { + for ($j = 0; $j < $this->columns; ++$j) { $subMatrix = $this->crossOut(0, $j); if (fmod($j, 2) == 0) { $determinant += $this->matrix[0][$j] * $subMatrix->getDeterminant(); @@ -130,7 +131,7 @@ class Matrix return $this->determinant = $determinant; } - /** + /** * @return bool */ public function isSquare() @@ -144,8 +145,8 @@ class Matrix public function transpose() { $newMatrix = []; - for ($i = 0; $i < $this->rows; $i++) { - for ($j = 0; $j < $this->columns; $j++) { + for ($i = 0; $i < $this->rows; ++$i) { + for ($j = 0; $j < $this->columns; ++$j) { $newMatrix[$j][$i] = $this->matrix[$i][$j]; } } @@ -168,14 +169,15 @@ class Matrix $product = []; $multiplier = $matrix->toArray(); - for ($i = 0; $i < $this->rows; $i++) { - for ($j = 0; $j < $matrix->getColumns(); $j++) { + for ($i = 0; $i < $this->rows; ++$i) { + for ($j = 0; $j < $matrix->getColumns(); ++$j) { $product[$i][$j] = 0; - for ($k = 0; $k < $this->columns; $k++) { + for ($k = 0; $k < $this->columns; ++$k) { $product[$i][$j] += $this->matrix[$i][$k] * $multiplier[$k][$j]; } } } + return new self($product, false); } @@ -187,8 +189,8 @@ class 
Matrix public function divideByScalar($value) { $newMatrix = array(); - for ($i = 0; $i < $this->rows; $i++) { - for ($j = 0; $j < $this->columns; $j++) { + for ($i = 0; $i < $this->rows; ++$i) { + for ($j = 0; $j < $this->columns; ++$j) { $newMatrix[$i][$j] = $this->matrix[$i][$j] / $value; } } @@ -208,8 +210,8 @@ class Matrix } $newMatrix = array(); - for ($i = 0; $i < $this->rows; $i++) { - for ($j = 0; $j < $this->columns; $j++) { + for ($i = 0; $i < $this->rows; ++$i) { + for ($j = 0; $j < $this->columns; ++$j) { $subMatrix = $this->crossOut($i, $j); if (fmod($i + $j, 2) == 0) { $newMatrix[$i][$j] = ($subMatrix->getDeterminant()); @@ -234,20 +236,19 @@ class Matrix { $newMatrix = []; $r = 0; - for ($i = 0; $i < $this->rows; $i++) { + for ($i = 0; $i < $this->rows; ++$i) { $c = 0; if ($row != $i) { - for ($j = 0; $j < $this->columns; $j++) { + for ($j = 0; $j < $this->columns; ++$j) { if ($column != $j) { $newMatrix[$r][$c] = $this->matrix[$i][$j]; - $c++; + ++$c; } } - $r++; + ++$r; } } return new self($newMatrix, false); } - } diff --git a/src/Phpml/Regression/LeastSquares.php b/src/Phpml/Regression/LeastSquares.php index 34dc745..0b7f4cc 100644 --- a/src/Phpml/Regression/LeastSquares.php +++ b/src/Phpml/Regression/LeastSquares.php @@ -64,7 +64,7 @@ class LeastSquares implements Regression } /** - * coefficient(b) = (X'X)-1X'Y + * coefficient(b) = (X'X)-1X'Y. 
*/ private function computeCoefficients() { diff --git a/tests/Phpml/Regression/LeastSquaresTest.php b/tests/Phpml/Regression/LeastSquaresTest.php index 8859ba9..148cc4b 100644 --- a/tests/Phpml/Regression/LeastSquaresTest.php +++ b/tests/Phpml/Regression/LeastSquaresTest.php @@ -22,7 +22,7 @@ class LeastSquaresTest extends \PHPUnit_Framework_TestCase $this->assertEquals(4.06, $regression->predict([64]), '', $delta); //http://www.stat.wmich.edu/s216/book/node127.html - $samples = [[1 ,9300], [1, 10565], [1, 15000], [1, 15000], [1, 17764], [1, 57000], [1, 65940], [1, 73676], [1, 77006], [1, 93739], [1, 146088], [1, 153260]]; + $samples = [[1, 9300], [1, 10565], [1, 15000], [1, 15000], [1, 17764], [1, 57000], [1, 65940], [1, 73676], [1, 77006], [1, 93739], [1, 146088], [1, 153260]]; $targets = [[7100], [15500], [4400], [4400], [5900], [4600], [8800], [2000], [2750], [2550], [960], [1025]]; $regression = new LeastSquares(); @@ -40,13 +40,12 @@ class LeastSquaresTest extends \PHPUnit_Framework_TestCase $delta = 1; //http://www.stat.wmich.edu/s216/book/node129.html - $samples = [[1, 73676, 1996],[1, 77006, 1998],[1, 10565, 2000],[1, 146088, 1995],[1, 15000, 2001],[1, 65940, 2000],[1, 9300, 2000],[1, 93739, 1996],[1, 153260, 1994],[1, 17764, 2002],[1, 57000, 1998],[1, 15000, 2000]]; - $targets = [[2000], [ 2750], [15500], [ 960], [ 4400], [ 8800], [ 7100], [ 2550], [ 1025], [ 5900], [ 4600], [ 4400]]; + $samples = [[1, 73676, 1996], [1, 77006, 1998], [1, 10565, 2000], [1, 146088, 1995], [1, 15000, 2001], [1, 65940, 2000], [1, 9300, 2000], [1, 93739, 1996], [1, 153260, 1994], [1, 17764, 2002], [1, 57000, 1998], [1, 15000, 2000]]; + $targets = [[2000], [2750], [15500], [960], [4400], [8800], [7100], [2550], [1025], [5900], [4600], [4400]]; $regression = new LeastSquares(); $regression->train($samples, $targets); $this->assertEquals(4094, $regression->predict([60000, 1996]), '', $delta); } - } From b1c47d5e9dd288b48c1b2984105bc61ecd9c4726 Mon Sep 17 00:00:00 2001 From: 
Arkadiusz Kondas Date: Sat, 30 Apr 2016 13:32:40 +0200 Subject: [PATCH 43/59] test intercept and coefficients of linear regression --- src/Phpml/Regression/LeastSquares.php | 8 ++++++++ tests/Phpml/Regression/LeastSquaresTest.php | 7 +++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/Phpml/Regression/LeastSquares.php b/src/Phpml/Regression/LeastSquares.php index 0b7f4cc..af755c5 100644 --- a/src/Phpml/Regression/LeastSquares.php +++ b/src/Phpml/Regression/LeastSquares.php @@ -63,6 +63,14 @@ class LeastSquares implements Regression return $this->coefficients; } + /** + * @return float + */ + public function getIntercept() + { + return $this->intercept; + } + /** * coefficient(b) = (X'X)-1X'Y. */ diff --git a/tests/Phpml/Regression/LeastSquaresTest.php b/tests/Phpml/Regression/LeastSquaresTest.php index 148cc4b..7544417 100644 --- a/tests/Phpml/Regression/LeastSquaresTest.php +++ b/tests/Phpml/Regression/LeastSquaresTest.php @@ -37,7 +37,7 @@ class LeastSquaresTest extends \PHPUnit_Framework_TestCase public function testPredictMultiFeaturesSamples() { - $delta = 1; + $delta = 0.01; //http://www.stat.wmich.edu/s216/book/node129.html $samples = [[1, 73676, 1996], [1, 77006, 1998], [1, 10565, 2000], [1, 146088, 1995], [1, 15000, 2001], [1, 65940, 2000], [1, 9300, 2000], [1, 93739, 1996], [1, 153260, 1994], [1, 17764, 2002], [1, 57000, 1998], [1, 15000, 2000]]; @@ -46,6 +46,9 @@ class LeastSquaresTest extends \PHPUnit_Framework_TestCase $regression = new LeastSquares(); $regression->train($samples, $targets); - $this->assertEquals(4094, $regression->predict([60000, 1996]), '', $delta); + $this->assertEquals(-800614.957, $regression->getIntercept(), '', $delta); + $this->assertEquals([-0.0327, 404.14], $regression->getCoefficients(), '', $delta); + $this->assertEquals(4094.82, $regression->predict([60000, 1996]), '', $delta); + $this->assertEquals(5711.40, $regression->predict([60000, 2000]), '', $delta); } } From 
ff79de7e14cc4d009e488fddfaa8e3add9193ed8 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sat, 30 Apr 2016 13:54:01 +0200 Subject: [PATCH 44/59] better arguments format for regression --- src/Phpml/Math/Matrix.php | 25 ++++++++++++---- src/Phpml/Regression/LeastSquares.php | 32 +++++++++++++++++++-- tests/Phpml/Regression/LeastSquaresTest.php | 12 ++++---- 3 files changed, 55 insertions(+), 14 deletions(-) diff --git a/src/Phpml/Math/Matrix.php b/src/Phpml/Math/Matrix.php index a071fd9..aeab3bc 100644 --- a/src/Phpml/Math/Matrix.php +++ b/src/Phpml/Math/Matrix.php @@ -51,6 +51,21 @@ class Matrix $this->matrix = $matrix; } + /** + * @param array $array + * + * @return Matrix + */ + public static function fromFlatArray(array $array) + { + $matrix = []; + foreach ($array as $value) { + $matrix[] = [$value]; + } + + return new self($matrix); + } + /** * @return array */ @@ -115,16 +130,14 @@ class Matrix if ($this->rows == 1 && $this->columns == 1) { $determinant = $this->matrix[0][0]; } elseif ($this->rows == 2 && $this->columns == 2) { - $determinant = $this->matrix[0][0] * $this->matrix[1][1] - + $determinant = + $this->matrix[0][0] * $this->matrix[1][1] - $this->matrix[0][1] * $this->matrix[1][0]; } else { for ($j = 0; $j < $this->columns; ++$j) { $subMatrix = $this->crossOut(0, $j); - if (fmod($j, 2) == 0) { - $determinant += $this->matrix[0][$j] * $subMatrix->getDeterminant(); - } else { - $determinant -= $this->matrix[0][$j] * $subMatrix->getDeterminant(); - } + $minor = $this->matrix[0][$j] * $subMatrix->getDeterminant(); + $determinant += fmod($j, 2) == 0 ? 
$minor : -$minor; } } diff --git a/src/Phpml/Regression/LeastSquares.php b/src/Phpml/Regression/LeastSquares.php index af755c5..cd0251f 100644 --- a/src/Phpml/Regression/LeastSquares.php +++ b/src/Phpml/Regression/LeastSquares.php @@ -76,8 +76,8 @@ class LeastSquares implements Regression */ private function computeCoefficients() { - $samplesMatrix = new Matrix($this->samples); - $targetsMatrix = new Matrix($this->targets); + $samplesMatrix = $this->getSamplesMatrix(); + $targetsMatrix = $this->getTargetsMatrix(); $ts = $samplesMatrix->transpose()->multiply($samplesMatrix)->inverse(); $tf = $samplesMatrix->transpose()->multiply($targetsMatrix); @@ -85,4 +85,32 @@ class LeastSquares implements Regression $this->coefficients = $ts->multiply($tf)->getColumnValues(0); $this->intercept = array_shift($this->coefficients); } + + /** + * Add one dimension for intercept calculation. + * + * @return Matrix + */ + private function getSamplesMatrix() + { + $samples = []; + foreach ($this->samples as $sample) { + array_unshift($sample, 1); + $samples[] = $sample; + } + + return new Matrix($samples); + } + + /** + * @return Matrix + */ + private function getTargetsMatrix() + { + if (is_array($this->targets[0])) { + return new Matrix($this->targets); + } + + return Matrix::fromFlatArray($this->targets); + } } diff --git a/tests/Phpml/Regression/LeastSquaresTest.php b/tests/Phpml/Regression/LeastSquaresTest.php index 7544417..a9b4882 100644 --- a/tests/Phpml/Regression/LeastSquaresTest.php +++ b/tests/Phpml/Regression/LeastSquaresTest.php @@ -13,8 +13,8 @@ class LeastSquaresTest extends \PHPUnit_Framework_TestCase $delta = 0.01; //https://www.easycalculation.com/analytical/learn-least-square-regression.php - $samples = [[1, 60], [1, 61], [1, 62], [1, 63], [1, 65]]; - $targets = [[3.1], [3.6], [3.8], [4], [4.1]]; + $samples = [[60], [61], [62], [63], [65]]; + $targets = [3.1, 3.6, 3.8, 4, 4.1]; $regression = new LeastSquares(); $regression->train($samples, $targets); @@ -22,8 +22,8 
@@ class LeastSquaresTest extends \PHPUnit_Framework_TestCase $this->assertEquals(4.06, $regression->predict([64]), '', $delta); //http://www.stat.wmich.edu/s216/book/node127.html - $samples = [[1, 9300], [1, 10565], [1, 15000], [1, 15000], [1, 17764], [1, 57000], [1, 65940], [1, 73676], [1, 77006], [1, 93739], [1, 146088], [1, 153260]]; - $targets = [[7100], [15500], [4400], [4400], [5900], [4600], [8800], [2000], [2750], [2550], [960], [1025]]; + $samples = [[9300], [10565], [15000], [15000], [17764], [57000], [65940], [73676], [77006], [93739], [146088], [153260]]; + $targets = [7100, 15500, 4400, 4400, 5900, 4600, 8800, 2000, 2750, 2550, 960, 1025]; $regression = new LeastSquares(); $regression->train($samples, $targets); @@ -40,8 +40,8 @@ class LeastSquaresTest extends \PHPUnit_Framework_TestCase $delta = 0.01; //http://www.stat.wmich.edu/s216/book/node129.html - $samples = [[1, 73676, 1996], [1, 77006, 1998], [1, 10565, 2000], [1, 146088, 1995], [1, 15000, 2001], [1, 65940, 2000], [1, 9300, 2000], [1, 93739, 1996], [1, 153260, 1994], [1, 17764, 2002], [1, 57000, 1998], [1, 15000, 2000]]; - $targets = [[2000], [2750], [15500], [960], [4400], [8800], [7100], [2550], [1025], [5900], [4600], [4400]]; + $samples = [[73676, 1996], [77006, 1998], [10565, 2000], [146088, 1995], [15000, 2001], [65940, 2000], [9300, 2000], [93739, 1996], [153260, 1994], [17764, 2002], [57000, 1998], [15000, 2000]]; + $targets = [2000, 2750, 15500, 960, 4400, 8800, 7100, 2550, 1025, 5900, 4600, 4400]; $regression = new LeastSquares(); $regression->train($samples, $targets); From ff9adc267cd074fa1c0b0aca8418b7d6d51efcc4 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sat, 30 Apr 2016 13:54:58 +0200 Subject: [PATCH 45/59] better arguments format for regression --- tests/Phpml/Regression/LeastSquaresTest.php | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/Phpml/Regression/LeastSquaresTest.php b/tests/Phpml/Regression/LeastSquaresTest.php index a9b4882..8bd444f 
100644 --- a/tests/Phpml/Regression/LeastSquaresTest.php +++ b/tests/Phpml/Regression/LeastSquaresTest.php @@ -35,6 +35,20 @@ class LeastSquaresTest extends \PHPUnit_Framework_TestCase $this->assertEquals(278.66, $regression->predict([153260]), '', $delta); } + public function testPredictSingleFeatureSamplesWithMatrixTargets() + { + $delta = 0.01; + + //https://www.easycalculation.com/analytical/learn-least-square-regression.php + $samples = [[60], [61], [62], [63], [65]]; + $targets = [[3.1], [3.6], [3.8], [4], [4.1]]; + + $regression = new LeastSquares(); + $regression->train($samples, $targets); + + $this->assertEquals(4.06, $regression->predict([64]), '', $delta); + } + public function testPredictMultiFeaturesSamples() { $delta = 0.01; From ee9bb7b252d97934faa791bf439f873960f3ba15 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sat, 30 Apr 2016 23:21:32 +0200 Subject: [PATCH 46/59] add tests for matrix class --- tests/Phpml/Math/MatrixTest.php | 176 ++++++++++++++++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100644 tests/Phpml/Math/MatrixTest.php diff --git a/tests/Phpml/Math/MatrixTest.php b/tests/Phpml/Math/MatrixTest.php new file mode 100644 index 0000000..64bb903 --- /dev/null +++ b/tests/Phpml/Math/MatrixTest.php @@ -0,0 +1,176 @@ +assertInstanceOf(Matrix::class, $matrix); + $this->assertEquals([[1], [2], [3], [4]], $matrix->toArray()); + $this->assertEquals(4, $matrix->getRows()); + $this->assertEquals(1, $matrix->getColumns()); + $this->assertEquals($flatArray, $matrix->getColumnValues(0)); + } + + /** + * @expectedException \Phpml\Exception\MatrixException + */ + public function testThrowExceptionOnInvalidColumnNumber() + { + $matrix = new Matrix([[1, 2, 3], [4, 5, 6]]); + $matrix->getColumnValues(4); + } + + /** + * @expectedException \Phpml\Exception\MatrixException + */ + public function testThrowExceptionOnGetDeterminantIfArrayIsNotSquare() + { + $matrix = new Matrix([[1, 2, 3], [4, 5, 6]]); + $matrix->getDeterminant(); + } 
+ + public function testGetMatrixDeterminant() + { + //http://matrix.reshish.com/determinant.php + $matrix = new Matrix([ + [3, 3, 3], + [4, 2, 1], + [5, 6, 7], + ]); + $this->assertEquals(-3, $matrix->getDeterminant()); + + $matrix = new Matrix([ + [1, 2, 3, 3, 2, 1], + [1 / 2, 5, 6, 7, 1, 1], + [3 / 2, 7 / 2, 2, 0, 6, 8], + [1, 8, 10, 1, 2, 2], + [1 / 4, 4, 1, 0, 2, 3 / 7], + [1, 8, 7, 5, 4, 4 / 5], + ]); + $this->assertEquals(1116.5035, $matrix->getDeterminant(), '', $delta = 0.0001); + } + + public function testMatrixTranspose() + { + $matrix = new Matrix([ + [3, 3, 3], + [4, 2, 1], + [5, 6, 7], + ]); + + $transposedMatrix = [ + [3, 4, 5], + [3, 2, 6], + [3, 1, 7], + ]; + + $this->assertEquals($transposedMatrix, $matrix->transpose()->toArray()); + } + + /** + * @expectedException \Phpml\Exception\InvalidArgumentException + */ + public function testThrowExceptionOnMultiplyWhenInconsistentMatrixSupplied() + { + $matrix1 = new Matrix([[1, 2, 3], [4, 5, 6]]); + $matrix2 = new Matrix([[3, 2, 1], [6, 5, 4]]); + + $matrix1->multiply($matrix2); + } + + public function testMatrixMultiplyByMatrix() + { + $matrix1 = new Matrix([ + [1, 2, 3], + [4, 5, 6], + ]); + + $matrix2 = new Matrix([ + [7, 8], + [9, 10], + [11, 12], + ]); + + $product = [ + [58, 64], + [139, 154], + ]; + + $this->assertEquals($product, $matrix1->multiply($matrix2)->toArray()); + } + + public function testDivideByScalar() + { + $matrix = new Matrix([ + [4, 6, 8], + [2, 10, 20], + ]); + + $quotient = [ + [2, 3, 4], + [1, 5, 10], + ]; + + $this->assertEquals($quotient, $matrix->divideByScalar(2)->toArray()); + } + + /** + * @expectedException \Phpml\Exception\MatrixException + */ + public function testThrowExceptionWhenInverseIfArrayIsNotSquare() + { + $matrix = new Matrix([[1, 2, 3], [4, 5, 6]]); + $matrix->inverse(); + } + + public function testInverseMatrix() + { + //http://ncalculators.com/matrix/inverse-matrix.htm + $matrix = new Matrix([ + [3, 4, 2], + [4, 5, 5], + [1, 1, 1], + ]); + + 
$inverseMatrix = [ + [0, -1, 5], + [1 / 2, 1 / 2, -7 / 2], + [-1 / 2, 1 / 2, -1 / 2], + ]; + + $this->assertEquals($inverseMatrix, $matrix->inverse()->toArray(), '', $delta = 0.0001); + } + + public function testCrossOutMatrix() + { + $matrix = new Matrix([ + [3, 4, 2], + [4, 5, 5], + [1, 1, 1], + ]); + + $crossOuted = [ + [3, 2], + [1, 1], + ]; + + $this->assertEquals($crossOuted, $matrix->crossOut(1, 1)->toArray()); + } +} From f7b91bea72b385f46ff57d3d8c926f975dbbd09a Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sat, 30 Apr 2016 23:45:21 +0200 Subject: [PATCH 47/59] change Classifier namespace to Classification --- src/Phpml/{Classifier => Classification}/Classifier.php | 2 +- .../{Classifier => Classification}/KNearestNeighbors.php | 6 +++--- src/Phpml/{Classifier => Classification}/NaiveBayes.php | 6 +++--- .../{Classifier => Classification}/SupportVectorMachine.php | 6 +++--- .../{Classifier => Classification}/Traits/Predictable.php | 2 +- .../{Classifier => Classification}/Traits/Trainable.php | 2 +- .../KNearestNeighborsTest.php | 4 ++-- .../Phpml/{Classifier => Classification}/NaiveBayesTest.php | 4 ++-- 8 files changed, 16 insertions(+), 16 deletions(-) rename src/Phpml/{Classifier => Classification}/Classifier.php (90%) rename src/Phpml/{Classifier => Classification}/KNearestNeighbors.php (93%) rename src/Phpml/{Classifier => Classification}/NaiveBayes.php (85%) rename src/Phpml/{Classifier => Classification}/SupportVectorMachine.php (89%) rename src/Phpml/{Classifier => Classification}/Traits/Predictable.php (94%) rename src/Phpml/{Classifier => Classification}/Traits/Trainable.php (90%) rename tests/Phpml/{Classifier => Classification}/KNearestNeighborsTest.php (96%) rename tests/Phpml/{Classifier => Classification}/NaiveBayesTest.php (94%) diff --git a/src/Phpml/Classifier/Classifier.php b/src/Phpml/Classification/Classifier.php similarity index 90% rename from src/Phpml/Classifier/Classifier.php rename to 
src/Phpml/Classification/Classifier.php index 90250a9..00e6779 100644 --- a/src/Phpml/Classifier/Classifier.php +++ b/src/Phpml/Classification/Classifier.php @@ -2,7 +2,7 @@ declare (strict_types = 1); -namespace Phpml\Classifier; +namespace Phpml\Classification; interface Classifier { diff --git a/src/Phpml/Classifier/KNearestNeighbors.php b/src/Phpml/Classification/KNearestNeighbors.php similarity index 93% rename from src/Phpml/Classifier/KNearestNeighbors.php rename to src/Phpml/Classification/KNearestNeighbors.php index 46733f5..93991ae 100644 --- a/src/Phpml/Classifier/KNearestNeighbors.php +++ b/src/Phpml/Classification/KNearestNeighbors.php @@ -2,10 +2,10 @@ declare (strict_types = 1); -namespace Phpml\Classifier; +namespace Phpml\Classification; -use Phpml\Classifier\Traits\Predictable; -use Phpml\Classifier\Traits\Trainable; +use Phpml\Classification\Traits\Predictable; +use Phpml\Classification\Traits\Trainable; use Phpml\Math\Distance; use Phpml\Math\Distance\Euclidean; diff --git a/src/Phpml/Classifier/NaiveBayes.php b/src/Phpml/Classification/NaiveBayes.php similarity index 85% rename from src/Phpml/Classifier/NaiveBayes.php rename to src/Phpml/Classification/NaiveBayes.php index cf8dcaa..ae98e1d 100644 --- a/src/Phpml/Classifier/NaiveBayes.php +++ b/src/Phpml/Classification/NaiveBayes.php @@ -2,10 +2,10 @@ declare (strict_types = 1); -namespace Phpml\Classifier; +namespace Phpml\Classification; -use Phpml\Classifier\Traits\Predictable; -use Phpml\Classifier\Traits\Trainable; +use Phpml\Classification\Traits\Predictable; +use Phpml\Classification\Traits\Trainable; class NaiveBayes implements Classifier { diff --git a/src/Phpml/Classifier/SupportVectorMachine.php b/src/Phpml/Classification/SupportVectorMachine.php similarity index 89% rename from src/Phpml/Classifier/SupportVectorMachine.php rename to src/Phpml/Classification/SupportVectorMachine.php index 8ee3731..7b0d854 100644 --- a/src/Phpml/Classifier/SupportVectorMachine.php +++ 
b/src/Phpml/Classification/SupportVectorMachine.php @@ -2,10 +2,10 @@ declare (strict_types = 1); -namespace Phpml\Classifier; +namespace Phpml\Classification; -use Phpml\Classifier\Traits\Predictable; -use Phpml\Classifier\Traits\Trainable; +use Phpml\Classification\Traits\Predictable; +use Phpml\Classification\Traits\Trainable; use Phpml\Math\Kernel; class SupportVectorMachine implements Classifier diff --git a/src/Phpml/Classifier/Traits/Predictable.php b/src/Phpml/Classification/Traits/Predictable.php similarity index 94% rename from src/Phpml/Classifier/Traits/Predictable.php rename to src/Phpml/Classification/Traits/Predictable.php index 6f5df6a..804b54a 100644 --- a/src/Phpml/Classifier/Traits/Predictable.php +++ b/src/Phpml/Classification/Traits/Predictable.php @@ -2,7 +2,7 @@ declare (strict_types = 1); -namespace Phpml\Classifier\Traits; +namespace Phpml\Classification\Traits; trait Predictable { diff --git a/src/Phpml/Classifier/Traits/Trainable.php b/src/Phpml/Classification/Traits/Trainable.php similarity index 90% rename from src/Phpml/Classifier/Traits/Trainable.php rename to src/Phpml/Classification/Traits/Trainable.php index ee7d6fc..8fa97f2 100644 --- a/src/Phpml/Classifier/Traits/Trainable.php +++ b/src/Phpml/Classification/Traits/Trainable.php @@ -2,7 +2,7 @@ declare (strict_types = 1); -namespace Phpml\Classifier\Traits; +namespace Phpml\Classification\Traits; trait Trainable { diff --git a/tests/Phpml/Classifier/KNearestNeighborsTest.php b/tests/Phpml/Classification/KNearestNeighborsTest.php similarity index 96% rename from tests/Phpml/Classifier/KNearestNeighborsTest.php rename to tests/Phpml/Classification/KNearestNeighborsTest.php index fd7ebad..1f6f99f 100644 --- a/tests/Phpml/Classifier/KNearestNeighborsTest.php +++ b/tests/Phpml/Classification/KNearestNeighborsTest.php @@ -2,9 +2,9 @@ declare (strict_types = 1); -namespace tests\Classifier; +namespace tests\Classification; -use Phpml\Classifier\KNearestNeighbors; +use 
Phpml\Classification\KNearestNeighbors; use Phpml\Math\Distance\Chebyshev; class KNearestNeighborsTest extends \PHPUnit_Framework_TestCase diff --git a/tests/Phpml/Classifier/NaiveBayesTest.php b/tests/Phpml/Classification/NaiveBayesTest.php similarity index 94% rename from tests/Phpml/Classifier/NaiveBayesTest.php rename to tests/Phpml/Classification/NaiveBayesTest.php index ce52bbc..3482cf5 100644 --- a/tests/Phpml/Classifier/NaiveBayesTest.php +++ b/tests/Phpml/Classification/NaiveBayesTest.php @@ -2,9 +2,9 @@ declare (strict_types = 1); -namespace tests\Classifier; +namespace tests\Classification; -use Phpml\Classifier\NaiveBayes; +use Phpml\Classification\NaiveBayes; class NaiveBayesTest extends \PHPUnit_Framework_TestCase { From 2952557028dba99afac9c9dd5425b6665fb0634f Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sat, 30 Apr 2016 23:47:35 +0200 Subject: [PATCH 48/59] improve matrix inverse --- src/Phpml/Math/Matrix.php | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/Phpml/Math/Matrix.php b/src/Phpml/Math/Matrix.php index aeab3bc..ee8fd3c 100644 --- a/src/Phpml/Math/Matrix.php +++ b/src/Phpml/Math/Matrix.php @@ -53,7 +53,7 @@ class Matrix /** * @param array $array - * + * * @return Matrix */ public static function fromFlatArray(array $array) @@ -225,12 +225,8 @@ class Matrix $newMatrix = array(); for ($i = 0; $i < $this->rows; ++$i) { for ($j = 0; $j < $this->columns; ++$j) { - $subMatrix = $this->crossOut($i, $j); - if (fmod($i + $j, 2) == 0) { - $newMatrix[$i][$j] = ($subMatrix->getDeterminant()); - } else { - $newMatrix[$i][$j] = -($subMatrix->getDeterminant()); - } + $minor = $this->crossOut($i, $j)->getDeterminant(); + $newMatrix[$i][$j] = fmod($i + $j, 2) == 0 ? 
$minor : -$minor; } } From 650e7dd20d65b68ce36ebc341dc086b75e210462 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sat, 30 Apr 2016 23:54:05 +0200 Subject: [PATCH 49/59] simply getDeterminant method --- src/Phpml/Math/Matrix.php | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/Phpml/Math/Matrix.php b/src/Phpml/Math/Matrix.php index ee8fd3c..7c04119 100644 --- a/src/Phpml/Math/Matrix.php +++ b/src/Phpml/Math/Matrix.php @@ -126,6 +126,16 @@ class Matrix throw MatrixException::notSquareMatrix(); } + return $this->determinant = $this->calculateDeterminant(); + } + + /** + * @return float|int + * + * @throws MatrixException + */ + private function calculateDeterminant() + { $determinant = 0; if ($this->rows == 1 && $this->columns == 1) { $determinant = $this->matrix[0][0]; @@ -141,7 +151,7 @@ class Matrix } } - return $this->determinant = $determinant; + return $determinant; } /** From 22963114c309fe084040a7902bd02646a941923b Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sun, 1 May 2016 00:47:44 +0200 Subject: [PATCH 50/59] dbscan clustering algorithm --- src/Phpml/Clustering/Clusterer.php | 16 ++++ src/Phpml/Clustering/DBSCAN.php | 103 ++++++++++++++++++++++++++ tests/Phpml/Clustering/DBSCANTest.php | 39 ++++++++++ 3 files changed, 158 insertions(+) create mode 100644 src/Phpml/Clustering/Clusterer.php create mode 100644 src/Phpml/Clustering/DBSCAN.php create mode 100644 tests/Phpml/Clustering/DBSCANTest.php diff --git a/src/Phpml/Clustering/Clusterer.php b/src/Phpml/Clustering/Clusterer.php new file mode 100644 index 0000000..760c996 --- /dev/null +++ b/src/Phpml/Clustering/Clusterer.php @@ -0,0 +1,16 @@ +epsilon = $epsilon; + $this->minSamples = $minSamples; + $this->distanceMetric = new Distance\Euclidean(); + } + + /** + * @param array $samples + * + * @return array + */ + public function cluster(array $samples) + { + $clusters = []; + $visited = []; + + foreach($samples as $index => $sample) { + 
if(isset($visited[$index])) { + continue; + } + $visited[$index] = true; + + $regionSamples = $this->getSamplesInRegion($sample, $samples); + if(count($regionSamples) >= $this->minSamples) { + $clusters[] = $this->expandCluster($regionSamples, $visited); + } + } + + return $clusters; + } + + /** + * @param array $localSample + * @param array $samples + * + * @return array + */ + private function getSamplesInRegion($localSample, $samples) { + $region = []; + + foreach($samples as $index => $sample) { + if($this->distanceMetric->distance($localSample, $sample) < $this->epsilon) { + $region[$index] = $sample; + } + } + + return $region; + } + + /** + * @param array $samples + * @param array $visited + * + * @return array + */ + private function expandCluster($samples, &$visited) { + $cluster = []; + + foreach($samples as $index => $sample) { + if(!isset($visited[$index])) { + $visited[$index] = true; + $regionSamples = $this->getSamplesInRegion($sample, $samples); + if(count($regionSamples) > $this->minSamples) { + $cluster = array_merge($regionSamples, $cluster); + } + } + + $cluster[] = $sample; + } + + return $cluster; + } + +} diff --git a/tests/Phpml/Clustering/DBSCANTest.php b/tests/Phpml/Clustering/DBSCANTest.php new file mode 100644 index 0000000..748b87a --- /dev/null +++ b/tests/Phpml/Clustering/DBSCANTest.php @@ -0,0 +1,39 @@ +assertEquals($clustered, $dbscan->cluster($samples)); + } + + public function testDBSCANSamplesInCircleClustering() + { + $samples = [[1, 1],[6, 6],[1, -1],[5, 6],[-1, -1],[7, 8],[-1, 1],[7, 7]]; + + $clustered = [ + [[1, 1],[1, -1],[-1, -1],[-1, 1]], + [[6, 6],[5, 6],[7, 8],[7, 7]] + ]; + + $dbscan = new DBSCAN($epsilon = 3, $minSamples = 4); + + $this->assertEquals($clustered, $dbscan->cluster($samples)); + } + +} From 26be771668919bd87e5b39a34b23b72912ef964a Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sun, 1 May 2016 00:56:03 +0200 Subject: [PATCH 51/59] add distanceMetric for dbscan --- src/Phpml/Clustering/DBSCAN.php | 
10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/Phpml/Clustering/DBSCAN.php b/src/Phpml/Clustering/DBSCAN.php index 2972f98..361520b 100644 --- a/src/Phpml/Clustering/DBSCAN.php +++ b/src/Phpml/Clustering/DBSCAN.php @@ -4,6 +4,7 @@ declare(strict_types = 1); namespace Phpml\Clustering; use Phpml\Math\Distance; +use Phpml\Math\Distance\Euclidean; class DBSCAN implements Clusterer { @@ -25,12 +26,17 @@ class DBSCAN implements Clusterer /** * @param float $epsilon * @param int $minSamples + * @param Distance $distanceMetric */ - public function __construct($epsilon = 0.5, $minSamples = 3) + public function __construct($epsilon = 0.5, $minSamples = 3, Distance $distanceMetric = null) { + if (null === $distanceMetric) { + $distanceMetric = new Euclidean(); + } + $this->epsilon = $epsilon; $this->minSamples = $minSamples; - $this->distanceMetric = new Distance\Euclidean(); + $this->distanceMetric = $distanceMetric; } /** From 01a24997547e0e209502c4df7a16a33a10941a06 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sun, 1 May 2016 00:56:43 +0200 Subject: [PATCH 52/59] cs-fixer --- src/Phpml/Clustering/Clusterer.php | 5 ++--- src/Phpml/Clustering/DBSCAN.php | 30 ++++++++++++++------------- tests/Phpml/Clustering/DBSCANTest.php | 15 +++++++------- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/Phpml/Clustering/Clusterer.php b/src/Phpml/Clustering/Clusterer.php index 760c996..8324b41 100644 --- a/src/Phpml/Clustering/Clusterer.php +++ b/src/Phpml/Clustering/Clusterer.php @@ -1,16 +1,15 @@ $sample) { - if(isset($visited[$index])) { + foreach ($samples as $index => $sample) { + if (isset($visited[$index])) { continue; } $visited[$index] = true; $regionSamples = $this->getSamplesInRegion($sample, $samples); - if(count($regionSamples) >= $this->minSamples) { + if (count($regionSamples) >= $this->minSamples) { $clusters[] = $this->expandCluster($regionSamples, $visited); } } @@ -70,11 +71,12 @@ class DBSCAN implements 
Clusterer * * @return array */ - private function getSamplesInRegion($localSample, $samples) { + private function getSamplesInRegion($localSample, $samples) + { $region = []; - foreach($samples as $index => $sample) { - if($this->distanceMetric->distance($localSample, $sample) < $this->epsilon) { + foreach ($samples as $index => $sample) { + if ($this->distanceMetric->distance($localSample, $sample) < $this->epsilon) { $region[$index] = $sample; } } @@ -88,14 +90,15 @@ class DBSCAN implements Clusterer * * @return array */ - private function expandCluster($samples, &$visited) { + private function expandCluster($samples, &$visited) + { $cluster = []; - foreach($samples as $index => $sample) { - if(!isset($visited[$index])) { + foreach ($samples as $index => $sample) { + if (!isset($visited[$index])) { $visited[$index] = true; $regionSamples = $this->getSamplesInRegion($sample, $samples); - if(count($regionSamples) > $this->minSamples) { + if (count($regionSamples) > $this->minSamples) { $cluster = array_merge($regionSamples, $cluster); } } @@ -105,5 +108,4 @@ class DBSCAN implements Clusterer return $cluster; } - } diff --git a/tests/Phpml/Clustering/DBSCANTest.php b/tests/Phpml/Clustering/DBSCANTest.php index 748b87a..5952636 100644 --- a/tests/Phpml/Clustering/DBSCANTest.php +++ b/tests/Phpml/Clustering/DBSCANTest.php @@ -1,5 +1,6 @@ assertEquals($clustered, $dbscan->cluster($samples)); } - } From c0513e9b8234a217d5b89ed6f57fcee488003cdd Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sun, 1 May 2016 23:17:09 +0200 Subject: [PATCH 53/59] kmeans clustering --- src/Phpml/Clustering/KMeans.php | 51 +++++ src/Phpml/Clustering/KMeans/Cluster.php | 101 ++++++++ src/Phpml/Clustering/KMeans/Point.php | 95 ++++++++ src/Phpml/Clustering/KMeans/Space.php | 216 ++++++++++++++++++ .../Exception/InvalidArgumentException.php | 9 + tests/Phpml/Clustering/DBSCANTest.php | 7 +- tests/Phpml/Clustering/KMeansTest.php | 58 +++++ 7 files changed, 532 insertions(+), 5 deletions(-) 
create mode 100644 src/Phpml/Clustering/KMeans.php create mode 100644 src/Phpml/Clustering/KMeans/Cluster.php create mode 100644 src/Phpml/Clustering/KMeans/Point.php create mode 100644 src/Phpml/Clustering/KMeans/Space.php create mode 100644 tests/Phpml/Clustering/KMeansTest.php diff --git a/src/Phpml/Clustering/KMeans.php b/src/Phpml/Clustering/KMeans.php new file mode 100644 index 0000000..7bebe7e --- /dev/null +++ b/src/Phpml/Clustering/KMeans.php @@ -0,0 +1,51 @@ +clustersNumber = $clustersNumber; + } + + /** + * @param array $samples + * + * @return array + */ + public function cluster(array $samples) + { + $space = new Space(count($samples[0])); + foreach ($samples as $sample) { + $space->addPoint($sample); + } + + $clusters = []; + foreach ($space->solve($this->clustersNumber) as $cluster) + { + $clusters[] = $cluster->getPoints(); + } + + return $clusters; + } + +} diff --git a/src/Phpml/Clustering/KMeans/Cluster.php b/src/Phpml/Clustering/KMeans/Cluster.php new file mode 100644 index 0000000..fec6d07 --- /dev/null +++ b/src/Phpml/Clustering/KMeans/Cluster.php @@ -0,0 +1,101 @@ +points = new SplObjectStorage; + } + + /** + * @return array + */ + public function getPoints() + { + $points = []; + foreach ($this->points as $point) { + $points[] = $point->toArray(); + } + + return $points; + } + + public function toArray() + { + $points = array(); + foreach ($this->points as $point) + $points[] = $point->toArray(); + + return array( + 'centroid' => parent::toArray(), + 'points' => $points, + ); + } + + public function attach(Point $point) + { + if ($point instanceof self) + throw new LogicException("cannot attach a cluster to another"); + + $this->points->attach($point); + return $point; + } + + public function detach(Point $point) + { + $this->points->detach($point); + return $point; + } + + public function attachAll(SplObjectStorage $points) + { + $this->points->addAll($points); + } + + public function detachAll(SplObjectStorage $points) + { + 
$this->points->removeAll($points); + } + + public function updateCentroid() + { + if (!$count = count($this->points)) + return; + + $centroid = $this->space->newPoint(array_fill(0, $this->dimention, 0)); + + foreach ($this->points as $point) + for ($n=0; $n<$this->dimention; $n++) + $centroid->coordinates[$n] += $point->coordinates[$n]; + + for ($n=0; $n<$this->dimention; $n++) + $this->coordinates[$n] = $centroid->coordinates[$n] / $count; + } + + public function getIterator() + { + return $this->points; + } + + public function count() + { + return count($this->points); + } +} diff --git a/src/Phpml/Clustering/KMeans/Point.php b/src/Phpml/Clustering/KMeans/Point.php new file mode 100644 index 0000000..4d888c3 --- /dev/null +++ b/src/Phpml/Clustering/KMeans/Point.php @@ -0,0 +1,95 @@ +space = $space; + $this->dimention = $space->getDimention(); + $this->coordinates = $coordinates; + } + + public function toArray() + { + return $this->coordinates; + } + + public function getDistanceWith(self $point, $precise = true) + { + if ($point->space !== $this->space) + throw new LogicException("can only calculate distances from points in the same space"); + + $distance = 0; + for ($n=0; $n<$this->dimention; $n++) { + $difference = $this->coordinates[$n] - $point->coordinates[$n]; + $distance += $difference * $difference; + } + + return $precise ? 
sqrt($distance) : $distance; + } + + public function getClosest($points) + { + foreach($points as $point) { + $distance = $this->getDistanceWith($point, false); + + if (!isset($minDistance)) { + $minDistance = $distance; + $minPoint = $point; + continue; + } + + if ($distance < $minDistance) { + $minDistance = $distance; + $minPoint = $point; + } + } + + return $minPoint; + } + + public function belongsTo(Space $space) + { + return $this->space === $space; + } + + public function getSpace() + { + return $this->space; + } + + public function getCoordinates() + { + return $this->coordinates; + } + + public function offsetExists($offset) + { + return isset($this->coordinates[$offset]); + } + + public function offsetGet($offset) + { + return $this->coordinates[$offset]; + } + + public function offsetSet($offset, $value) + { + $this->coordinates[$offset] = $value; + } + + public function offsetUnset($offset) + { + unset($this->coordinates[$offset]); + } +} diff --git a/src/Phpml/Clustering/KMeans/Space.php b/src/Phpml/Clustering/KMeans/Space.php new file mode 100644 index 0000000..090a48b --- /dev/null +++ b/src/Phpml/Clustering/KMeans/Space.php @@ -0,0 +1,216 @@ +dimention = $dimention; + } + + public function toArray() + { + $points = array(); + foreach ($this as $point) + $points[] = $point->toArray(); + + return array('points' => $points); + } + + public function newPoint(array $coordinates) + { + if (count($coordinates) != $this->dimention) + throw new LogicException("(" . implode(',', $coordinates) . 
") is not a point of this space"); + + return new Point($this, $coordinates); + } + + public function addPoint(array $coordinates, $data = null) + { + return $this->attach($this->newPoint($coordinates), $data); + } + + public function attach($point, $data = null) + { + if (!$point instanceof Point) + throw new InvalidArgumentException("can only attach points to spaces"); + + return parent::attach($point, $data); + } + + public function getDimention() + { + return $this->dimention; + } + + public function getBoundaries() + { + if (!count($this)) + return false; + + $min = $this->newPoint(array_fill(0, $this->dimention, null)); + $max = $this->newPoint(array_fill(0, $this->dimention, null)); + + foreach ($this as $point) { + for ($n=0; $n < $this->dimention; $n++) { + ($min[$n] > $point[$n] || $min[$n] === null) && $min[$n] = $point[$n]; + ($max[$n] < $point[$n] || $max[$n] === null) && $max[$n] = $point[$n]; + } + } + + return array($min, $max); + } + + public function getRandomPoint(Point $min, Point $max) + { + $point = $this->newPoint(array_fill(0, $this->dimention, null)); + + for ($n=0; $n < $this->dimention; $n++) + $point[$n] = rand($min[$n], $max[$n]); + + return $point; + } + + /** + * @param $nbClusters + * @param int $seed + * @param null $iterationCallback + * @return array|Cluster[] + */ + public function solve($nbClusters, $seed = self::SEED_DEFAULT, $iterationCallback = null) + { + if ($iterationCallback && !is_callable($iterationCallback)) + throw new InvalidArgumentException("invalid iteration callback"); + + // initialize K clusters + $clusters = $this->initializeClusters($nbClusters, $seed); + + // there's only one cluster, clusterization has no meaning + if (count($clusters) == 1) + return $clusters[0]; + + // until convergence is reached + do { + $iterationCallback && $iterationCallback($this, $clusters); + } while ($this->iterate($clusters)); + + // clustering is done. 
+ return $clusters; + } + + protected function initializeClusters($nbClusters, $seed) + { + if ($nbClusters <= 0) + throw new InvalidArgumentException("invalid clusters number"); + + switch ($seed) { + // the default seeding method chooses completely random centroid + case self::SEED_DEFAULT: + // get the space boundaries to avoid placing clusters centroid too far from points + list($min, $max) = $this->getBoundaries(); + + // initialize N clusters with a random point within space boundaries + for ($n=0; $n<$nbClusters; $n++) + $clusters[] = new Cluster($this, $this->getRandomPoint($min, $max)->getCoordinates()); + + break; + + // the DASV seeding method consists of finding good initial centroids for the clusters + case self::SEED_DASV: + // find a random point + $position = rand(1, count($this)); + for ($i=1, $this->rewind(); $i<$position && $this->valid(); $i++, $this->next()); + $clusters[] = new Cluster($this, $this->current()->getCoordinates()); + + // retains the distances between points and their closest clusters + $distances = new SplObjectStorage; + + // create k clusters + for ($i=1; $i<$nbClusters; $i++) { + $sum = 0; + + // for each points, get the distance with the closest centroid already choosen + foreach ($this as $point) { + $distance = $point->getDistanceWith($point->getClosest($clusters)); + $sum += $distances[$point] = $distance; + } + + // choose a new random point using a weighted probability distribution + $sum = rand(0, $sum); + foreach ($this as $point) { + if (($sum -= $distances[$point]) > 0) + continue; + + $clusters[] = new Cluster($this, $point->getCoordinates()); + break; + } + } + + break; + } + + // assing all points to the first cluster + $clusters[0]->attachAll($this); + + return $clusters; + } + + protected function iterate($clusters) + { + $continue = false; + + // migration storages + $attach = new SplObjectStorage; + $detach = new SplObjectStorage; + + // calculate proximity amongst points and clusters + foreach ($clusters as 
$cluster) { + foreach ($cluster as $point) { + // find the closest cluster + $closest = $point->getClosest($clusters); + + // move the point from its old cluster to its closest + if ($closest !== $cluster) { + isset($attach[$closest]) || $attach[$closest] = new SplObjectStorage; + isset($detach[$cluster]) || $detach[$cluster] = new SplObjectStorage; + + $attach[$closest]->attach($point); + $detach[$cluster]->attach($point); + + $continue = true; + } + } + } + + // perform points migrations + foreach ($attach as $cluster) + $cluster->attachAll($attach[$cluster]); + + foreach ($detach as $cluster) + $cluster->detachAll($detach[$cluster]); + + // update all cluster's centroids + foreach ($clusters as $cluster) + $cluster->updateCentroid(); + + return $continue; + } +} diff --git a/src/Phpml/Exception/InvalidArgumentException.php b/src/Phpml/Exception/InvalidArgumentException.php index 9e88250..3185205 100644 --- a/src/Phpml/Exception/InvalidArgumentException.php +++ b/src/Phpml/Exception/InvalidArgumentException.php @@ -57,4 +57,13 @@ class InvalidArgumentException extends \Exception { return new self('Inconsistent matrix aupplied'); } + + /** + * @return InvalidArgumentException + */ + public static function invalidClustersNumber() + { + return new self('Invalid clusters number'); + } + } diff --git a/tests/Phpml/Clustering/DBSCANTest.php b/tests/Phpml/Clustering/DBSCANTest.php index 5952636..7be5331 100644 --- a/tests/Phpml/Clustering/DBSCANTest.php +++ b/tests/Phpml/Clustering/DBSCANTest.php @@ -11,7 +11,6 @@ class DBSCANTest extends \PHPUnit_Framework_TestCase public function testDBSCANSamplesClustering() { $samples = [[1, 1], [8, 7], [1, 2], [7, 8], [2, 1], [8, 9]]; - $clustered = [ [[1, 1], [1, 2], [2, 1]], [[8, 7], [7, 8], [8, 9]], @@ -20,12 +19,9 @@ class DBSCANTest extends \PHPUnit_Framework_TestCase $dbscan = new DBSCAN($epsilon = 2, $minSamples = 3); $this->assertEquals($clustered, $dbscan->cluster($samples)); - } - public function 
testDBSCANSamplesInCircleClustering() - { + $samples = [[1, 1], [6, 6], [1, -1], [5, 6], [-1, -1], [7, 8], [-1, 1], [7, 7]]; - $clustered = [ [[1, 1], [1, -1], [-1, -1], [-1, 1]], [[6, 6], [5, 6], [7, 8], [7, 7]], @@ -35,4 +31,5 @@ class DBSCANTest extends \PHPUnit_Framework_TestCase $this->assertEquals($clustered, $dbscan->cluster($samples)); } + } diff --git a/tests/Phpml/Clustering/KMeansTest.php b/tests/Phpml/Clustering/KMeansTest.php new file mode 100644 index 0000000..5c21c89 --- /dev/null +++ b/tests/Phpml/Clustering/KMeansTest.php @@ -0,0 +1,58 @@ +cluster($samples); + + $this->assertEquals(2, count($clusters)); + + foreach ($samples as $index => $sample) { + if(in_array($sample, $clusters[0]) || in_array($sample, $clusters[1])) { + unset($samples[$index]); + } + } + $this->assertEquals(0, count($samples)); + } + + public function testKMeansMoreSamplesClustering() + { + $samples = [ + [80,55],[86,59],[19,85],[41,47],[57,58], + [76,22],[94,60],[13,93],[90,48],[52,54], + [62,46],[88,44],[85,24],[63,14],[51,40], + [75,31],[86,62],[81,95],[47,22],[43,95], + [71,19],[17,65],[69,21],[59,60],[59,12], + [15,22],[49,93],[56,35],[18,20],[39,59], + [50,15],[81,36],[67,62],[32,15],[75,65], + [10,47],[75,18],[13,45],[30,62],[95,79], + [64,11],[92,14],[94,49],[39,13],[60,68], + [62,10],[74,44],[37,42],[97,60],[47,73], + ]; + + $kmeans = new KMeans(4); + $clusters = $kmeans->cluster($samples); + + $this->assertEquals(4, count($clusters)); + + foreach ($samples as $index => $sample) { + for($i=0; $i<4; $i++) { + if(in_array($sample, $clusters[$i])) { + unset($samples[$index]); + } + } + } + $this->assertEquals(0, count($samples)); + } + +} From 7572304d5039585538bc63804e7c0a8dda2a6e75 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sun, 1 May 2016 23:36:33 +0200 Subject: [PATCH 54/59] refactor kmeans subclasses --- src/Phpml/Clustering/KMeans.php | 27 +- src/Phpml/Clustering/KMeans/Cluster.php | 188 +++++---- src/Phpml/Clustering/KMeans/Point.php | 175 ++++---- 
src/Phpml/Clustering/KMeans/Space.php | 381 ++++++++++-------- .../Exception/InvalidArgumentException.php | 1 - tests/Phpml/Clustering/DBSCANTest.php | 2 - tests/Phpml/Clustering/KMeansTest.php | 31 +- 7 files changed, 465 insertions(+), 340 deletions(-) diff --git a/src/Phpml/Clustering/KMeans.php b/src/Phpml/Clustering/KMeans.php index 7bebe7e..cdae3b5 100644 --- a/src/Phpml/Clustering/KMeans.php +++ b/src/Phpml/Clustering/KMeans.php @@ -1,5 +1,6 @@ clustersNumber = $clustersNumber; + $this->initialization = $initialization; } /** @@ -38,14 +49,12 @@ class KMeans implements Clusterer foreach ($samples as $sample) { $space->addPoint($sample); } - + $clusters = []; - foreach ($space->solve($this->clustersNumber) as $cluster) - { + foreach ($space->solve($this->clustersNumber, $this->initialization) as $cluster) { $clusters[] = $cluster->getPoints(); } - + return $clusters; } - } diff --git a/src/Phpml/Clustering/KMeans/Cluster.php b/src/Phpml/Clustering/KMeans/Cluster.php index fec6d07..5cd974d 100644 --- a/src/Phpml/Clustering/KMeans/Cluster.php +++ b/src/Phpml/Clustering/KMeans/Cluster.php @@ -1,101 +1,137 @@ points = new SplObjectStorage; - } + /** + * @param Space $space + * @param array $coordinates + */ + public function __construct(Space $space, array $coordinates) + { + parent::__construct($coordinates); + $this->space = $space; + $this->points = new SplObjectStorage(); + } - /** - * @return array - */ - public function getPoints() - { - $points = []; - foreach ($this->points as $point) { - $points[] = $point->toArray(); - } + /** + * @return array + */ + public function getPoints() + { + $points = []; + foreach ($this->points as $point) { + $points[] = $point->toArray(); + } - return $points; - } - - public function toArray() - { - $points = array(); - foreach ($this->points as $point) - $points[] = $point->toArray(); + return $points; + } - return array( - 'centroid' => parent::toArray(), - 'points' => $points, - ); - } + /** + * @return array + */ + 
public function toArray() + { + return array( + 'centroid' => parent::toArray(), + 'points' => $this->getPoints(), + ); + } - public function attach(Point $point) - { - if ($point instanceof self) - throw new LogicException("cannot attach a cluster to another"); + /** + * @param Point $point + * + * @return Point + */ + public function attach(Point $point) + { + if ($point instanceof self) { + throw new LogicException('cannot attach a cluster to another'); + } - $this->points->attach($point); - return $point; - } + $this->points->attach($point); - public function detach(Point $point) - { - $this->points->detach($point); - return $point; - } + return $point; + } - public function attachAll(SplObjectStorage $points) - { - $this->points->addAll($points); - } + /** + * @param Point $point + * + * @return Point + */ + public function detach(Point $point) + { + $this->points->detach($point); - public function detachAll(SplObjectStorage $points) - { - $this->points->removeAll($points); - } + return $point; + } - public function updateCentroid() - { - if (!$count = count($this->points)) - return; + /** + * @param SplObjectStorage $points + */ + public function attachAll(SplObjectStorage $points) + { + $this->points->addAll($points); + } - $centroid = $this->space->newPoint(array_fill(0, $this->dimention, 0)); + /** + * @param SplObjectStorage $points + */ + public function detachAll(SplObjectStorage $points) + { + $this->points->removeAll($points); + } - foreach ($this->points as $point) - for ($n=0; $n<$this->dimention; $n++) - $centroid->coordinates[$n] += $point->coordinates[$n]; + public function updateCentroid() + { + if (!$count = count($this->points)) { + return; + } - for ($n=0; $n<$this->dimention; $n++) - $this->coordinates[$n] = $centroid->coordinates[$n] / $count; - } + $centroid = $this->space->newPoint(array_fill(0, $this->dimension, 0)); - public function getIterator() - { - return $this->points; - } + foreach ($this->points as $point) { + for ($n = 0; $n < 
$this->dimension; ++$n) { + $centroid->coordinates[$n] += $point->coordinates[$n]; + } + } - public function count() - { - return count($this->points); - } + for ($n = 0; $n < $this->dimension; ++$n) { + $this->coordinates[$n] = $centroid->coordinates[$n] / $count; + } + } + + /** + * @return Point[]|SplObjectStorage + */ + public function getIterator() + { + return $this->points; + } + + /** + * @return mixed + */ + public function count() + { + return count($this->points); + } } diff --git a/src/Phpml/Clustering/KMeans/Point.php b/src/Phpml/Clustering/KMeans/Point.php index 4d888c3..9ff4b45 100644 --- a/src/Phpml/Clustering/KMeans/Point.php +++ b/src/Phpml/Clustering/KMeans/Point.php @@ -1,95 +1,124 @@ space = $space; - $this->dimention = $space->getDimention(); - $this->coordinates = $coordinates; - } + /** + * @var array + */ + protected $coordinates; - public function toArray() - { - return $this->coordinates; - } + /** + * @param array $coordinates + */ + public function __construct(array $coordinates) + { + $this->dimension = count($coordinates); + $this->coordinates = $coordinates; + } - public function getDistanceWith(self $point, $precise = true) - { - if ($point->space !== $this->space) - throw new LogicException("can only calculate distances from points in the same space"); + /** + * @return array + */ + public function toArray() + { + return $this->coordinates; + } - $distance = 0; - for ($n=0; $n<$this->dimention; $n++) { - $difference = $this->coordinates[$n] - $point->coordinates[$n]; - $distance += $difference * $difference; - } + /** + * @param Point $point + * @param bool $precise + * + * @return int|mixed + */ + public function getDistanceWith(self $point, $precise = true) + { + $distance = 0; + for ($n = 0; $n < $this->dimension; ++$n) { + $difference = $this->coordinates[$n] - $point->coordinates[$n]; + $distance += $difference * $difference; + } - return $precise ? sqrt($distance) : $distance; - } + return $precise ? 
sqrt($distance) : $distance; + } - public function getClosest($points) - { - foreach($points as $point) { - $distance = $this->getDistanceWith($point, false); + /** + * @param $points + * + * @return mixed + */ + public function getClosest($points) + { + foreach ($points as $point) { + $distance = $this->getDistanceWith($point, false); - if (!isset($minDistance)) { - $minDistance = $distance; - $minPoint = $point; - continue; - } + if (!isset($minDistance)) { + $minDistance = $distance; + $minPoint = $point; + continue; + } - if ($distance < $minDistance) { - $minDistance = $distance; - $minPoint = $point; - } - } + if ($distance < $minDistance) { + $minDistance = $distance; + $minPoint = $point; + } + } - return $minPoint; - } + return $minPoint; + } - public function belongsTo(Space $space) - { - return $this->space === $space; - } + /** + * @return array + */ + public function getCoordinates() + { + return $this->coordinates; + } - public function getSpace() - { - return $this->space; - } + /** + * @param mixed $offset + * + * @return bool + */ + public function offsetExists($offset) + { + return isset($this->coordinates[$offset]); + } - public function getCoordinates() - { - return $this->coordinates; - } + /** + * @param mixed $offset + * + * @return mixed + */ + public function offsetGet($offset) + { + return $this->coordinates[$offset]; + } - public function offsetExists($offset) - { - return isset($this->coordinates[$offset]); - } + /** + * @param mixed $offset + * @param mixed $value + */ + public function offsetSet($offset, $value) + { + $this->coordinates[$offset] = $value; + } - public function offsetGet($offset) - { - return $this->coordinates[$offset]; - } - - public function offsetSet($offset, $value) - { - $this->coordinates[$offset] = $value; - } - - public function offsetUnset($offset) - { - unset($this->coordinates[$offset]); - } + /** + * @param mixed $offset + */ + public function offsetUnset($offset) + { + unset($this->coordinates[$offset]); + 
} } diff --git a/src/Phpml/Clustering/KMeans/Space.php b/src/Phpml/Clustering/KMeans/Space.php index 090a48b..f4465cf 100644 --- a/src/Phpml/Clustering/KMeans/Space.php +++ b/src/Phpml/Clustering/KMeans/Space.php @@ -1,216 +1,271 @@ dimension = $dimension; + } - public function __construct($dimention) - { - if ($dimention < 1) - throw new LogicException("a space dimention cannot be null or negative"); + /** + * @return array + */ + public function toArray() + { + $points = []; + foreach ($this as $point) { + $points[] = $point->toArray(); + } - $this->dimention = $dimention; - } + return ['points' => $points]; + } - public function toArray() - { - $points = array(); - foreach ($this as $point) - $points[] = $point->toArray(); + /** + * @param array $coordinates + * + * @return Point + */ + public function newPoint(array $coordinates) + { + if (count($coordinates) != $this->dimension) { + throw new LogicException('('.implode(',', $coordinates).') is not a point of this space'); + } - return array('points' => $points); - } + return new Point($coordinates); + } - public function newPoint(array $coordinates) - { - if (count($coordinates) != $this->dimention) - throw new LogicException("(" . implode(',', $coordinates) . 
") is not a point of this space"); + /** + * @param array $coordinates + * @param null $data + */ + public function addPoint(array $coordinates, $data = null) + { + return $this->attach($this->newPoint($coordinates), $data); + } - return new Point($this, $coordinates); - } + /** + * @param object $point + * @param null $data + */ + public function attach($point, $data = null) + { + if (!$point instanceof Point) { + throw new InvalidArgumentException('can only attach points to spaces'); + } - public function addPoint(array $coordinates, $data = null) - { - return $this->attach($this->newPoint($coordinates), $data); - } + return parent::attach($point, $data); + } - public function attach($point, $data = null) - { - if (!$point instanceof Point) - throw new InvalidArgumentException("can only attach points to spaces"); + /** + * @return int + */ + public function getDimension() + { + return $this->dimension; + } - return parent::attach($point, $data); - } + /** + * @return array|bool + */ + public function getBoundaries() + { + if (!count($this)) { + return false; + } - public function getDimention() - { - return $this->dimention; - } + $min = $this->newPoint(array_fill(0, $this->dimension, null)); + $max = $this->newPoint(array_fill(0, $this->dimension, null)); - public function getBoundaries() - { - if (!count($this)) - return false; + foreach ($this as $point) { + for ($n = 0; $n < $this->dimension; ++$n) { + ($min[$n] > $point[$n] || $min[$n] === null) && $min[$n] = $point[$n]; + ($max[$n] < $point[$n] || $max[$n] === null) && $max[$n] = $point[$n]; + } + } - $min = $this->newPoint(array_fill(0, $this->dimention, null)); - $max = $this->newPoint(array_fill(0, $this->dimention, null)); + return array($min, $max); + } - foreach ($this as $point) { - for ($n=0; $n < $this->dimention; $n++) { - ($min[$n] > $point[$n] || $min[$n] === null) && $min[$n] = $point[$n]; - ($max[$n] < $point[$n] || $max[$n] === null) && $max[$n] = $point[$n]; - } - } + /** + * @param Point 
$min + * @param Point $max + * + * @return Point + */ + public function getRandomPoint(Point $min, Point $max) + { + $point = $this->newPoint(array_fill(0, $this->dimension, null)); - return array($min, $max); - } + for ($n = 0; $n < $this->dimension; ++$n) { + $point[$n] = rand($min[$n], $max[$n]); + } - public function getRandomPoint(Point $min, Point $max) - { - $point = $this->newPoint(array_fill(0, $this->dimention, null)); + return $point; + } - for ($n=0; $n < $this->dimention; $n++) - $point[$n] = rand($min[$n], $max[$n]); + /** + * @param $nbClusters + * @param int $seed + * @param null $iterationCallback + * + * @return array|Cluster[] + */ + public function solve($nbClusters, $seed = KMeans::INIT_RANDOM, $iterationCallback = null) + { + if ($iterationCallback && !is_callable($iterationCallback)) { + throw new InvalidArgumentException('invalid iteration callback'); + } - return $point; - } + // initialize K clusters + $clusters = $this->initializeClusters($nbClusters, $seed); - /** - * @param $nbClusters - * @param int $seed - * @param null $iterationCallback - * @return array|Cluster[] - */ - public function solve($nbClusters, $seed = self::SEED_DEFAULT, $iterationCallback = null) - { - if ($iterationCallback && !is_callable($iterationCallback)) - throw new InvalidArgumentException("invalid iteration callback"); + // there's only one cluster, clusterization has no meaning + if (count($clusters) == 1) { + return $clusters[0]; + } - // initialize K clusters - $clusters = $this->initializeClusters($nbClusters, $seed); + // until convergence is reached + do { + $iterationCallback && $iterationCallback($this, $clusters); + } while ($this->iterate($clusters)); - // there's only one cluster, clusterization has no meaning - if (count($clusters) == 1) - return $clusters[0]; + // clustering is done. 
+ return $clusters; + } - // until convergence is reached - do { - $iterationCallback && $iterationCallback($this, $clusters); - } while ($this->iterate($clusters)); + /** + * @param $nbClusters + * @param $seed + * + * @return array + */ + protected function initializeClusters($nbClusters, $seed) + { + if ($nbClusters <= 0) { + throw new InvalidArgumentException('invalid clusters number'); + } - // clustering is done. - return $clusters; - } + switch ($seed) { + // the default seeding method chooses completely random centroid + case KMeans::INIT_RANDOM: + // get the space boundaries to avoid placing clusters centroid too far from points + list($min, $max) = $this->getBoundaries(); - protected function initializeClusters($nbClusters, $seed) - { - if ($nbClusters <= 0) - throw new InvalidArgumentException("invalid clusters number"); + // initialize N clusters with a random point within space boundaries + for ($n = 0; $n < $nbClusters; ++$n) { + $clusters[] = new Cluster($this, $this->getRandomPoint($min, $max)->getCoordinates()); + } - switch ($seed) { - // the default seeding method chooses completely random centroid - case self::SEED_DEFAULT: - // get the space boundaries to avoid placing clusters centroid too far from points - list($min, $max) = $this->getBoundaries(); + break; - // initialize N clusters with a random point within space boundaries - for ($n=0; $n<$nbClusters; $n++) - $clusters[] = new Cluster($this, $this->getRandomPoint($min, $max)->getCoordinates()); + // the DASV seeding method consists of finding good initial centroids for the clusters + case KMeans::INIT_KMEANS_PLUS_PLUS: + // find a random point + $position = rand(1, count($this)); + for ($i = 1, $this->rewind(); $i < $position && $this->valid(); $i++, $this->next()); + $clusters[] = new Cluster($this, $this->current()->getCoordinates()); - break; + // retains the distances between points and their closest clusters + $distances = new SplObjectStorage(); - // the DASV seeding method consists 
of finding good initial centroids for the clusters - case self::SEED_DASV: - // find a random point - $position = rand(1, count($this)); - for ($i=1, $this->rewind(); $i<$position && $this->valid(); $i++, $this->next()); - $clusters[] = new Cluster($this, $this->current()->getCoordinates()); + // create k clusters + for ($i = 1; $i < $nbClusters; ++$i) { + $sum = 0; - // retains the distances between points and their closest clusters - $distances = new SplObjectStorage; + // for each points, get the distance with the closest centroid already choosen + foreach ($this as $point) { + $distance = $point->getDistanceWith($point->getClosest($clusters)); + $sum += $distances[$point] = $distance; + } - // create k clusters - for ($i=1; $i<$nbClusters; $i++) { - $sum = 0; + // choose a new random point using a weighted probability distribution + $sum = rand(0, (int) $sum); + foreach ($this as $point) { + if (($sum -= $distances[$point]) > 0) { + continue; + } - // for each points, get the distance with the closest centroid already choosen - foreach ($this as $point) { - $distance = $point->getDistanceWith($point->getClosest($clusters)); - $sum += $distances[$point] = $distance; - } + $clusters[] = new Cluster($this, $point->getCoordinates()); + break; + } + } - // choose a new random point using a weighted probability distribution - $sum = rand(0, $sum); - foreach ($this as $point) { - if (($sum -= $distances[$point]) > 0) - continue; + break; + } - $clusters[] = new Cluster($this, $point->getCoordinates()); - break; - } - } + // assing all points to the first cluster + $clusters[0]->attachAll($this); - break; - } + return $clusters; + } - // assing all points to the first cluster - $clusters[0]->attachAll($this); + /** + * @param $clusters + * + * @return bool + */ + protected function iterate($clusters) + { + $continue = false; - return $clusters; - } + // migration storages + $attach = new SplObjectStorage(); + $detach = new SplObjectStorage(); - protected function 
iterate($clusters) - { - $continue = false; + // calculate proximity amongst points and clusters + foreach ($clusters as $cluster) { + foreach ($cluster as $point) { + // find the closest cluster + $closest = $point->getClosest($clusters); - // migration storages - $attach = new SplObjectStorage; - $detach = new SplObjectStorage; + // move the point from its old cluster to its closest + if ($closest !== $cluster) { + isset($attach[$closest]) || $attach[$closest] = new SplObjectStorage(); + isset($detach[$cluster]) || $detach[$cluster] = new SplObjectStorage(); - // calculate proximity amongst points and clusters - foreach ($clusters as $cluster) { - foreach ($cluster as $point) { - // find the closest cluster - $closest = $point->getClosest($clusters); + $attach[$closest]->attach($point); + $detach[$cluster]->attach($point); - // move the point from its old cluster to its closest - if ($closest !== $cluster) { - isset($attach[$closest]) || $attach[$closest] = new SplObjectStorage; - isset($detach[$cluster]) || $detach[$cluster] = new SplObjectStorage; + $continue = true; + } + } + } - $attach[$closest]->attach($point); - $detach[$cluster]->attach($point); + // perform points migrations + foreach ($attach as $cluster) { + $cluster->attachAll($attach[$cluster]); + } - $continue = true; - } - } - } + foreach ($detach as $cluster) { + $cluster->detachAll($detach[$cluster]); + } - // perform points migrations - foreach ($attach as $cluster) - $cluster->attachAll($attach[$cluster]); + // update all cluster's centroids + foreach ($clusters as $cluster) { + $cluster->updateCentroid(); + } - foreach ($detach as $cluster) - $cluster->detachAll($detach[$cluster]); - - // update all cluster's centroids - foreach ($clusters as $cluster) - $cluster->updateCentroid(); - - return $continue; - } + return $continue; + } } diff --git a/src/Phpml/Exception/InvalidArgumentException.php b/src/Phpml/Exception/InvalidArgumentException.php index 3185205..45d532e 100644 --- 
a/src/Phpml/Exception/InvalidArgumentException.php +++ b/src/Phpml/Exception/InvalidArgumentException.php @@ -65,5 +65,4 @@ class InvalidArgumentException extends \Exception { return new self('Invalid clusters number'); } - } diff --git a/tests/Phpml/Clustering/DBSCANTest.php b/tests/Phpml/Clustering/DBSCANTest.php index 7be5331..be37fff 100644 --- a/tests/Phpml/Clustering/DBSCANTest.php +++ b/tests/Phpml/Clustering/DBSCANTest.php @@ -20,7 +20,6 @@ class DBSCANTest extends \PHPUnit_Framework_TestCase $this->assertEquals($clustered, $dbscan->cluster($samples)); - $samples = [[1, 1], [6, 6], [1, -1], [5, 6], [-1, -1], [7, 8], [-1, 1], [7, 7]]; $clustered = [ [[1, 1], [1, -1], [-1, -1], [-1, 1]], @@ -31,5 +30,4 @@ class DBSCANTest extends \PHPUnit_Framework_TestCase $this->assertEquals($clustered, $dbscan->cluster($samples)); } - } diff --git a/tests/Phpml/Clustering/KMeansTest.php b/tests/Phpml/Clustering/KMeansTest.php index 5c21c89..dae62fd 100644 --- a/tests/Phpml/Clustering/KMeansTest.php +++ b/tests/Phpml/Clustering/KMeansTest.php @@ -1,5 +1,6 @@ assertEquals(2, count($clusters)); foreach ($samples as $index => $sample) { - if(in_array($sample, $clusters[0]) || in_array($sample, $clusters[1])) { + if (in_array($sample, $clusters[0]) || in_array($sample, $clusters[1])) { unset($samples[$index]); } } @@ -28,16 +28,16 @@ class KMeansTest extends \PHPUnit_Framework_TestCase public function testKMeansMoreSamplesClustering() { $samples = [ - [80,55],[86,59],[19,85],[41,47],[57,58], - [76,22],[94,60],[13,93],[90,48],[52,54], - [62,46],[88,44],[85,24],[63,14],[51,40], - [75,31],[86,62],[81,95],[47,22],[43,95], - [71,19],[17,65],[69,21],[59,60],[59,12], - [15,22],[49,93],[56,35],[18,20],[39,59], - [50,15],[81,36],[67,62],[32,15],[75,65], - [10,47],[75,18],[13,45],[30,62],[95,79], - [64,11],[92,14],[94,49],[39,13],[60,68], - [62,10],[74,44],[37,42],[97,60],[47,73], + [80, 55], [86, 59], [19, 85], [41, 47], [57, 58], + [76, 22], [94, 60], [13, 93], [90, 48], [52, 54], + 
[62, 46], [88, 44], [85, 24], [63, 14], [51, 40], + [75, 31], [86, 62], [81, 95], [47, 22], [43, 95], + [71, 19], [17, 65], [69, 21], [59, 60], [59, 12], + [15, 22], [49, 93], [56, 35], [18, 20], [39, 59], + [50, 15], [81, 36], [67, 62], [32, 15], [75, 65], + [10, 47], [75, 18], [13, 45], [30, 62], [95, 79], + [64, 11], [92, 14], [94, 49], [39, 13], [60, 68], + [62, 10], [74, 44], [37, 42], [97, 60], [47, 73], ]; $kmeans = new KMeans(4); @@ -46,13 +46,12 @@ class KMeansTest extends \PHPUnit_Framework_TestCase $this->assertEquals(4, count($clusters)); foreach ($samples as $index => $sample) { - for($i=0; $i<4; $i++) { - if(in_array($sample, $clusters[$i])) { + for ($i = 0; $i < 4; ++$i) { + if (in_array($sample, $clusters[$i])) { unset($samples[$index]); } } } $this->assertEquals(0, count($samples)); } - } From bb9e1aa4f0ab358a0791dc73633c1e3e3ecc9f1d Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Sun, 1 May 2016 23:44:04 +0200 Subject: [PATCH 55/59] test kmeans init methods --- tests/Phpml/Clustering/KMeansTest.php | 36 +++++++++++---------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/tests/Phpml/Clustering/KMeansTest.php b/tests/Phpml/Clustering/KMeansTest.php index dae62fd..5a85b38 100644 --- a/tests/Phpml/Clustering/KMeansTest.php +++ b/tests/Phpml/Clustering/KMeansTest.php @@ -25,33 +25,27 @@ class KMeansTest extends \PHPUnit_Framework_TestCase $this->assertEquals(0, count($samples)); } - public function testKMeansMoreSamplesClustering() + public function testKMeansInitializationMethods() { $samples = [ - [80, 55], [86, 59], [19, 85], [41, 47], [57, 58], - [76, 22], [94, 60], [13, 93], [90, 48], [52, 54], - [62, 46], [88, 44], [85, 24], [63, 14], [51, 40], - [75, 31], [86, 62], [81, 95], [47, 22], [43, 95], - [71, 19], [17, 65], [69, 21], [59, 60], [59, 12], - [15, 22], [49, 93], [56, 35], [18, 20], [39, 59], - [50, 15], [81, 36], [67, 62], [32, 15], [75, 65], - [10, 47], [75, 18], [13, 45], [30, 62], [95, 79], - [64, 11], [92, 
14], [94, 49], [39, 13], [60, 68], - [62, 10], [74, 44], [37, 42], [97, 60], [47, 73], + [180, 155], [186, 159], [119, 185], [141, 147], [157, 158], + [176, 122], [194, 160], [113, 193], [190, 148], [152, 154], + [162, 146], [188, 144], [185, 124], [163, 114], [151, 140], + [175, 131], [186, 162], [181, 195], [147, 122], [143, 195], + [171, 119], [117, 165], [169, 121], [159, 160], [159, 112], + [115, 122], [149, 193], [156, 135], [118, 120], [139, 159], + [150, 115], [181, 136], [167, 162], [132, 115], [175, 165], + [110, 147], [175, 118], [113, 145], [130, 162], [195, 179], + [164, 111], [192, 114], [194, 149], [139, 113], [160, 168], + [162, 110], [174, 144], [137, 142], [197, 160], [147, 173], ]; - $kmeans = new KMeans(4); + $kmeans = new KMeans(4, KMeans::INIT_KMEANS_PLUS_PLUS); $clusters = $kmeans->cluster($samples); - $this->assertEquals(4, count($clusters)); - foreach ($samples as $index => $sample) { - for ($i = 0; $i < 4; ++$i) { - if (in_array($sample, $clusters[$i])) { - unset($samples[$index]); - } - } - } - $this->assertEquals(0, count($samples)); + $kmeans = new KMeans(4, KMeans::INIT_RANDOM); + $clusters = $kmeans->cluster($samples); + $this->assertEquals(4, count($clusters)); } } From 55e73b48e9e487b0fa3ed5b729e8b0a237b822f4 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Mon, 2 May 2016 12:07:53 +0200 Subject: [PATCH 56/59] add example code to readme --- README.md | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index e2fc4b3..749d264 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,21 @@ [![License](https://poser.pugx.org/php-ai/php-ml/license.svg)](https://packagist.org/packages/php-ai/php-ml) [![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/php-ai/php-ml/badges/quality-score.png?b=develop)](https://scrutinizer-ci.com/g/php-ai/php-ml/?branch=develop) -Fresh approach to machine learning in PHP. 
Note that at the moment PHP is not the best choice for machine learning but maybe this will change ... +Fresh approach to Machine Learning in PHP. Note that at the moment PHP is not the best choice for machine learning but maybe this will change ... + +Simple example of classification: +```php +use Phpml\Classifier\KNearestNeighbors; + +$samples = [[1, 3], [1, 4], [2, 4], [3, 1], [4, 1], [4, 2]]; +$labels = ['a', 'a', 'a', 'b', 'b', 'b']; + +$classifier = new KNearestNeighbors(); +$classifier->train($samples, $labels); + +$classifier->predict([3, 2]); +// return 'b' +``` ## Documentation @@ -20,14 +34,19 @@ Currently this library is in the process of developing, but You can install it w composer require php-ai/php-ml ``` -## To-Do +## Features -* implements more algorithms -* integration with Lavacharts for data visualization +* Classification +* Regression +* Clustering +* Cross Validation -## Testing +## Contribute -After installation, you can launch the test suite in project root directory (you will need to install dev requirements with composer) +- Issue Tracker: github.com/php-ai/php-ml/issues +- Source Code: github.com/php-ai/php-ml + +After installation, you can launch the test suite in project root directory (you will need to install dev requirements with Composer) ``` bin/phpunit @@ -39,4 +58,4 @@ PHP-ML is released under the MIT Licence. 
See the bundled LICENSE file for detai ## Author -Arkadiusz Kondas (@ArkadiuszKondas) \ No newline at end of file +Arkadiusz Kondas (@ArkadiuszKondas) From 5950af6072fad4797af815b8254bcd77614989da Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Mon, 2 May 2016 13:49:19 +0200 Subject: [PATCH 57/59] update and refactor documentation --- README.md | 14 ++ docs/index.md | 54 ++++++-- ...estneighbors.md => k-nearest-neighbors.md} | 6 +- .../{naivebayes.md => naive-bayes.md} | 4 +- docs/machine-learning/clustering/dbscan.md | 27 ++++ docs/machine-learning/clustering/k-means.md | 37 +++++ .../{randomsplit.md => random-split.md} | 0 docs/machine-learning/datasets/csv-dataset.md | 2 +- docs/machine-learning/datasets/demo/iris.md | 2 +- docs/machine-learning/metric/accuracy.md | 2 +- .../metric/distance/chebyshev.md | 3 - .../metric/distance/euclidean.md | 16 --- .../metric/distance/manhattan.md | 16 --- .../metric/distance/minkowski.md | 1 - .../regression/least-squares.md | 51 +++++++ docs/math/distance.md | 109 +++++++++++++++ docs/math/matrix.md | 129 ++++++++++++++++++ mkdocs.yml | 23 ++-- 18 files changed, 434 insertions(+), 62 deletions(-) rename docs/machine-learning/classification/{knearestneighbors.md => k-nearest-neighbors.md} (79%) rename docs/machine-learning/classification/{naivebayes.md => naive-bayes.md} (86%) create mode 100644 docs/machine-learning/clustering/dbscan.md create mode 100644 docs/machine-learning/clustering/k-means.md rename docs/machine-learning/cross-validation/{randomsplit.md => random-split.md} (100%) delete mode 100644 docs/machine-learning/metric/distance/chebyshev.md delete mode 100644 docs/machine-learning/metric/distance/euclidean.md delete mode 100644 docs/machine-learning/metric/distance/manhattan.md delete mode 100644 docs/machine-learning/metric/distance/minkowski.md create mode 100644 docs/machine-learning/regression/least-squares.md create mode 100644 docs/math/distance.md create mode 100644 docs/math/matrix.md diff --git 
a/README.md b/README.md index 749d264..71f7570 100644 --- a/README.md +++ b/README.md @@ -37,9 +37,23 @@ composer require php-ai/php-ml ## Features * Classification + * [k-Nearest Neighbors](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/k-nearest-neighbors/) + * [Naive Bayes](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/naive-bayes/) * Regression + * [Least Squares](http://php-ml.readthedocs.io/en/latest/machine-learning/regression/least-squares/) * Clustering + * [k-Means](http://php-ml.readthedocs.io/en/latest/machine-learning/clustering/k-means) + * [DBSCAN](http://php-ml.readthedocs.io/en/latest/machine-learning/clustering/dbscan) * Cross Validation + * [Random Split](http://php-ml.readthedocs.io/en/latest/machine-learning/cross-validation/random-split) +* Datasets + * [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset) + * Ready to use: + * [Iris](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/iris/) +* Math + * [Distance](http://php-ml.readthedocs.io/en/latest/math/distance/) + * [Matrix](http://php-ml.readthedocs.io/en/latest/math/matrix/) + ## Contribute diff --git a/docs/index.md b/docs/index.md index c3e2703..71f7570 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,11 +1,30 @@ -# PHP Machine Learning (PHP-ML) +# PHP Machine Learning library [![Build Status](https://scrutinizer-ci.com/g/php-ai/php-ml/badges/build.png?b=develop)](https://scrutinizer-ci.com/g/php-ai/php-ml/build-status/develop) +[![Documentation Status](https://readthedocs.org/projects/php-ml/badge/?version=develop)](http://php-ml.readthedocs.org/en/develop/?badge=develop) [![Total Downloads](https://poser.pugx.org/php-ai/php-ml/downloads.svg)](https://packagist.org/packages/php-ai/php-ml) [![License](https://poser.pugx.org/php-ai/php-ml/license.svg)](https://packagist.org/packages/php-ai/php-ml) [![Scrutinizer Code 
Quality](https://scrutinizer-ci.com/g/php-ai/php-ml/badges/quality-score.png?b=develop)](https://scrutinizer-ci.com/g/php-ai/php-ml/?branch=develop) -Fresh approach to machine learning in PHP. Note that at the moment PHP is not the best choice for machine learning but maybe this will change ... +Fresh approach to Machine Learning in PHP. Note that at the moment PHP is not the best choice for machine learning but maybe this will change ... + +Simple example of classification: +```php +use Phpml\Classifier\KNearestNeighbors; + +$samples = [[1, 3], [1, 4], [2, 4], [3, 1], [4, 1], [4, 2]]; +$labels = ['a', 'a', 'a', 'b', 'b', 'b']; + +$classifier = new KNearestNeighbors(); +$classifier->train($samples, $labels); + +$classifier->predict([3, 2]); +// return 'b' +``` + +## Documentation + +To find out how to use PHP-ML follow [Documentation](http://php-ml.readthedocs.org/). ## Installation @@ -15,14 +34,33 @@ Currently this library is in the process of developing, but You can install it w composer require php-ai/php-ml ``` -## To-Do +## Features -* implements more algorithms -* integration with Lavacharts for data visualization +* Classification + * [k-Nearest Neighbors](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/k-nearest-neighbors/) + * [Naive Bayes](http://php-ml.readthedocs.io/en/latest/machine-learning/classification/naive-bayes/) +* Regression + * [Least Squares](http://php-ml.readthedocs.io/en/latest/machine-learning/regression/least-squares/) +* Clustering + * [k-Means](http://php-ml.readthedocs.io/en/latest/machine-learning/clustering/k-means) + * [DBSCAN](http://php-ml.readthedocs.io/en/latest/machine-learning/clustering/dbscan) +* Cross Validation + * [Random Split](http://php-ml.readthedocs.io/en/latest/machine-learning/cross-validation/random-split) +* Datasets + * [CSV](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/csv-dataset) + * Ready to use: + * 
[Iris](http://php-ml.readthedocs.io/en/latest/machine-learning/datasets/demo/iris/) +* Math + * [Distance](http://php-ml.readthedocs.io/en/latest/math/distance/) + * [Matrix](http://php-ml.readthedocs.io/en/latest/math/matrix/) + -## Testing +## Contribute -After installation, you can launch the test suite in project root directory (you will need to install dev requirements with composer) +- Issue Tracker: github.com/php-ai/php-ml/issues +- Source Code: github.com/php-ai/php-ml + +After installation, you can launch the test suite in project root directory (you will need to install dev requirements with Composer) ``` bin/phpunit @@ -34,4 +72,4 @@ PHP-ML is released under the MIT Licence. See the bundled LICENSE file for detai ## Author -Arkadiusz Kondas (@ArkadiuszKondas) \ No newline at end of file +Arkadiusz Kondas (@ArkadiuszKondas) diff --git a/docs/machine-learning/classification/knearestneighbors.md b/docs/machine-learning/classification/k-nearest-neighbors.md similarity index 79% rename from docs/machine-learning/classification/knearestneighbors.md rename to docs/machine-learning/classification/k-nearest-neighbors.md index 7d16828..3d5aa27 100644 --- a/docs/machine-learning/classification/knearestneighbors.md +++ b/docs/machine-learning/classification/k-nearest-neighbors.md @@ -5,7 +5,7 @@ Classifier implementing the k-nearest neighbors algorithm. ### Constructor Parameters * $k - number of nearest neighbors to scan (default: 3) -* $distanceMetric - Distance class, default Euclidean (see Distance Metric documentation) +* $distanceMetric - Distance object, default Euclidean (see [distance documentation](math/distance/)) ``` $classifier = new KNearestNeighbors($k=4); @@ -14,7 +14,7 @@ $classifier = new KNearestNeighbors($k=3, new Minkowski($lambda=4)); ### Train -To train a classifier simply provide train samples and labels (as `array`): +To train a classifier simply provide train samples and labels (as `array`). 
Example: ``` $samples = [[1, 3], [1, 4], [2, 4], [3, 1], [4, 1], [4, 2]]; @@ -26,7 +26,7 @@ $classifier->train($samples, $labels); ### Predict -To predict sample class use `predict` method. You can provide one sample or array of samples: +To predict sample label use `predict` method. You can provide one sample or array of samples: ``` $classifier->predict([3, 2]); diff --git a/docs/machine-learning/classification/naivebayes.md b/docs/machine-learning/classification/naive-bayes.md similarity index 86% rename from docs/machine-learning/classification/naivebayes.md rename to docs/machine-learning/classification/naive-bayes.md index c700106..e990321 100644 --- a/docs/machine-learning/classification/naivebayes.md +++ b/docs/machine-learning/classification/naive-bayes.md @@ -4,7 +4,7 @@ Classifier based on applying Bayes' theorem with strong (naive) independence ass ### Train -To train a classifier simply provide train samples and labels (as `array`): +To train a classifier simply provide train samples and labels (as `array`). Example: ``` $samples = [[5, 1, 1], [1, 5, 1], [1, 1, 5]]; @@ -16,7 +16,7 @@ $classifier->train($samples, $labels); ### Predict -To predict sample class use `predict` method. You can provide one sample or array of samples: +To predict sample label use `predict` method. You can provide one sample or array of samples: ``` $classifier->predict([3, 1, 1]); diff --git a/docs/machine-learning/clustering/dbscan.md b/docs/machine-learning/clustering/dbscan.md new file mode 100644 index 0000000..45dd631 --- /dev/null +++ b/docs/machine-learning/clustering/dbscan.md @@ -0,0 +1,27 @@ +# DBSCAN clustering + +It is a density-based clustering algorithm: given a set of points in some space, it groups together points that are closely packed together (points with many nearby neighbors), marking as outliers points that lie alone in low-density regions (whose nearest neighbors are too far away). 
DBSCAN is one of the most common clustering algorithms and also most cited in scientific literature. +*(source: wikipedia)* + +### Constructor Parameters + +* $epsilon - epsilon, maximum distance between two samples for them to be considered as in the same neighborhood +* $minSamples - number of samples in a neighborhood for a point to be considered as a core point (this includes the point itself) +* $distanceMetric - Distance object, default Euclidean (see [distance documentation](math/distance/)) + +``` +$dbscan = new DBSCAN($epsilon = 2, $minSamples = 3); +$dbscan = new DBSCAN($epsilon = 2, $minSamples = 3, new Minkowski($lambda=4)); +``` + +### Clustering + +To divide the samples into clusters simply use the `cluster` method. It returns the `array` of clusters with samples inside. + +``` +$samples = [[1, 1], [8, 7], [1, 2], [7, 8], [2, 1], [8, 9]]; + +$dbscan = new DBSCAN($epsilon = 2, $minSamples = 3); +$dbscan->cluster($samples); +// return [0=>[[1, 1], ...], 1=>[[8, 7], ...]] +``` diff --git a/docs/machine-learning/clustering/k-means.md b/docs/machine-learning/clustering/k-means.md new file mode 100644 index 0000000..296feb1 --- /dev/null +++ b/docs/machine-learning/clustering/k-means.md @@ -0,0 +1,37 @@ +# K-means clustering + +The K-Means algorithm clusters data by trying to separate samples into n groups of equal variance, minimizing a criterion known as the inertia or within-cluster sum-of-squares. +This algorithm requires the number of clusters to be specified. + +### Constructor Parameters + +* $clustersNumber - number of clusters to find +* $initialization - initialization method, default kmeans++ (see below) + +``` +$kmeans = new KMeans(2); +$kmeans = new KMeans(4, KMeans::INIT_RANDOM); +``` + +### Clustering + +To divide the samples into clusters simply use the `cluster` method. It returns the `array` of clusters with samples inside. 
+ +``` +$samples = [[1, 1], [8, 7], [1, 2], [7, 8], [2, 1], [8, 9]]; + +$kmeans = new KMeans(2); +$kmeans->cluster($samples); +// return [0=>[[1, 1], ...], 1=>[[8, 7], ...]] +``` + +### Initialization methods + +#### kmeans++ (default) + +K-means++ method selects initial cluster centers for k-means clustering in a smart way to speed up convergence. +It uses the DASV seeding method, which consists of finding good initial centroids for the clusters. + +#### random + +Random initialization method chooses a completely random centroid. It gets the space boundaries to avoid placing cluster centroids too far from the sample data. diff --git a/docs/machine-learning/cross-validation/randomsplit.md b/docs/machine-learning/cross-validation/random-split.md similarity index 100% rename from docs/machine-learning/cross-validation/randomsplit.md rename to docs/machine-learning/cross-validation/random-split.md diff --git a/docs/machine-learning/datasets/csv-dataset.md b/docs/machine-learning/datasets/csv-dataset.md index 553bc60..0ea6319 100644 --- a/docs/machine-learning/datasets/csv-dataset.md +++ b/docs/machine-learning/datasets/csv-dataset.md @@ -12,4 +12,4 @@ Helper class that loads data from CSV file. It extends the `ArrayDataset`. $dataset = new CsvDataset('dataset.csv', 2, true); ``` -See Array Dataset for more information. +See [ArrayDataset](machine-learning/datasets/array-dataset/) for more information. 
diff --git a/docs/machine-learning/datasets/demo/iris.md b/docs/machine-learning/datasets/demo/iris.md index 9e00d5c..5972f1b 100644 --- a/docs/machine-learning/datasets/demo/iris.md +++ b/docs/machine-learning/datasets/demo/iris.md @@ -17,7 +17,7 @@ To load Iris dataset simple use: $dataset = new Iris(); ``` -### Several samples +### Several samples example ``` sepal length,sepal width,petal length,petal width,class diff --git a/docs/machine-learning/metric/accuracy.md b/docs/machine-learning/metric/accuracy.md index b8ec70a..5045973 100644 --- a/docs/machine-learning/metric/accuracy.md +++ b/docs/machine-learning/metric/accuracy.md @@ -4,7 +4,7 @@ Class for calculate classifier accuracy. ### Score -To calculate classifier accuracy score use `score` static method. Parametrs: +To calculate classifier accuracy score use `score` static method. Parameters: * $actualLabels - (array) true sample labels * $predictedLabels - (array) predicted labels (e.x. from test group) diff --git a/docs/machine-learning/metric/distance/chebyshev.md b/docs/machine-learning/metric/distance/chebyshev.md deleted file mode 100644 index e4374b6..0000000 --- a/docs/machine-learning/metric/distance/chebyshev.md +++ /dev/null @@ -1,3 +0,0 @@ -# Chebyshev - -Class for calculation Chebyshev distance. diff --git a/docs/machine-learning/metric/distance/euclidean.md b/docs/machine-learning/metric/distance/euclidean.md deleted file mode 100644 index decc18b..0000000 --- a/docs/machine-learning/metric/distance/euclidean.md +++ /dev/null @@ -1,16 +0,0 @@ -# Euclidean - -Class for calculation Euclidean distance. 
- -![euclidean](https://upload.wikimedia.org/math/8/4/9/849f040fd10bb86f7c85eb0bbe3566a4.png "Euclidean Distance") - -To calculate distance: - -``` -$a = [4, 6]; -$b = [2, 5]; - -$euclidean = new Euclidean(); -$euclidean->distance($a, $b); -// return 2.2360679774998 -``` diff --git a/docs/machine-learning/metric/distance/manhattan.md b/docs/machine-learning/metric/distance/manhattan.md deleted file mode 100644 index a1502cc..0000000 --- a/docs/machine-learning/metric/distance/manhattan.md +++ /dev/null @@ -1,16 +0,0 @@ -# Manhattan - -Class for calculation Manhattan distance. - -![manhattan](https://upload.wikimedia.org/math/4/c/5/4c568bd1d76a6b15e19cb2ac3ad75350.png "Manhattan Distance") - -To calculate distance: - -``` -$a = [4, 6]; -$b = [2, 5]; - -$manhattan = new Manhattan(); -$manhattan->distance($a, $b); -// return 3 -``` diff --git a/docs/machine-learning/metric/distance/minkowski.md b/docs/machine-learning/metric/distance/minkowski.md deleted file mode 100644 index aac44b8..0000000 --- a/docs/machine-learning/metric/distance/minkowski.md +++ /dev/null @@ -1 +0,0 @@ -# Minkowski diff --git a/docs/machine-learning/regression/least-squares.md b/docs/machine-learning/regression/least-squares.md new file mode 100644 index 0000000..4a00bcd --- /dev/null +++ b/docs/machine-learning/regression/least-squares.md @@ -0,0 +1,51 @@ +# LeastSquares Linear Regression + +Linear model that uses the least squares method to approximate a solution. + +### Train + +To train a model simply provide train samples and target values (as `array`). Example: + +``` +$samples = [[60], [61], [62], [63], [65]]; +$targets = [3.1, 3.6, 3.8, 4, 4.1]; + +$regression = new LeastSquares(); +$regression->train($samples, $targets); +``` + +### Predict + +To predict sample target value use `predict` method with sample to check (as `array`). 
Example: + +``` +$regression->predict([64]); +// return 4.06 +``` + +### Multiple Linear Regression + +The term multiple attached to linear regression means that there are two or more sample parameters used to predict target. +For example you can use: mileage and production year to predict price of a car. + +``` +$samples = [[73676, 1996], [77006, 1998], [10565, 2000], [146088, 1995], [15000, 2001], [65940, 2000], [9300, 2000], [93739, 1996], [153260, 1994], [17764, 2002], [57000, 1998], [15000, 2000]]; +$targets = [2000, 2750, 15500, 960, 4400, 8800, 7100, 2550, 1025, 5900, 4600, 4400]; + +$regression = new LeastSquares(); +$regression->train($samples, $targets); +$regression->predict([60000, 1996]) +// return 4094.82 +``` + +### Intercept and Coefficients + +After you train your model you can get the intercept and coefficients array. + +``` +$regression->getIntercept(); +// return -7.9635135135131 + +$regression->getCoefficients(); +// return [array(1) {[0]=>float(0.18783783783783)}] +``` diff --git a/docs/math/distance.md b/docs/math/distance.md new file mode 100644 index 0000000..fd491ea --- /dev/null +++ b/docs/math/distance.md @@ -0,0 +1,109 @@ +# Distance + +Selected algorithms require the use of a function for calculating the distance. + +### Euclidean + +Class for calculation Euclidean distance. + +![euclidean](https://upload.wikimedia.org/math/8/4/9/849f040fd10bb86f7c85eb0bbe3566a4.png "Euclidean Distance") + +To calculate Euclidean distance: + +``` +$a = [4, 6]; +$b = [2, 5]; + +$euclidean = new Euclidean(); +$euclidean->distance($a, $b); +// return 2.2360679774998 +``` + +### Manhattan + +Class for calculation Manhattan distance. + +![manhattan](https://upload.wikimedia.org/math/4/c/5/4c568bd1d76a6b15e19cb2ac3ad75350.png "Manhattan Distance") + +To calculate Manhattan distance: + +``` +$a = [4, 6]; +$b = [2, 5]; + +$manhattan = new Manhattan(); +$manhattan->distance($a, $b); +// return 3 +``` + +### Chebyshev + +Class for calculation Chebyshev distance. 
+ +![chebyshev](https://upload.wikimedia.org/math/7/1/2/71200f7dbb43b3bcfbcbdb9e02ab0a0c.png "Chebyshev Distance") + +To calculate Chebyshev distance: + +``` +$a = [4, 6]; +$b = [2, 5]; + +$chebyshev = new Chebyshev(); +$chebyshev->distance($a, $b); +// return 2 +``` + +### Minkowski + +Class for calculation Minkowski distance. + +![minkowski](https://upload.wikimedia.org/math/a/a/0/aa0c62083c12390cb15ac3217de88e66.png "Minkowski Distance") + +To calculate Minkowski distance: + +``` +$a = [4, 6]; +$b = [2, 5]; + +$minkowski = new Minkowski(); +$minkowski->distance($a, $b); +// return 2.080 +``` + +You can provide the `lambda` parameter: + +``` +$a = [6, 10, 3]; +$b = [2, 5, 5]; + +$minkowski = new Minkowski($lambda = 5); +$minkowski->distance($a, $b); +// return 5.300 +``` + +### Custom distance + +To apply your own function of distance use `Distance` interface. Example + +``` +class CustomDistance implements Distance +{ + /** + * @param array $a + * @param array $b + * + * @return float + */ + public function distance(array $a, array $b): float + { + $distance = []; + $count = count($a); + + for ($i = 0; $i < $count; ++$i) { + $distance[] = $a[$i] * $b[$i]; + } + + return min($distance); + } +} +``` diff --git a/docs/math/matrix.md b/docs/math/matrix.md new file mode 100644 index 0000000..3716347 --- /dev/null +++ b/docs/math/matrix.md @@ -0,0 +1,129 @@ +# Matrix + +Class that wraps PHP arrays to mathematical matrix. 
+ +### Creation + +To create Matrix use simple arrays: + +``` +$matrix = new Matrix([ + [3, 3, 3], + [4, 2, 1], + [5, 6, 7], +]); +``` + +You can also create Matrix (one dimension) from flat array: + +``` +$flatArray = [1, 2, 3, 4]; +$matrix = Matrix::fromFlatArray($flatArray); +``` + +### Matrix data + +Methods for reading data from Matrix: + +``` +$matrix->toArray(); // cast matrix to PHP array +$matrix->getRows(); // rows count +$matrix->getColumns(); // columns count +$matrix->getColumnValues($column=4); // get values from given column +``` + +### Determinant + +Read more about [matrix determinant](https://en.wikipedia.org/wiki/Determinant). + +``` +$matrix = new Matrix([ + [3, 3, 3], + [4, 2, 1], + [5, 6, 7], +]); + +$matrix->getDeterminant(); +// return -3 +``` + +### Transpose + +Read more about [matrix transpose](https://en.wikipedia.org/wiki/Transpose). + +``` +$matrix->transpose(); +// return new Matrix +``` + +### Multiply + +Multiply Matrix by another Matrix. + +``` +$matrix1 = new Matrix([ + [1, 2, 3], + [4, 5, 6], +]); + +$matrix2 = new Matrix([ + [7, 8], + [9, 10], + [11, 12], +]); + +$matrix1->multiply($matrix2); + +// result $product = [ +// [58, 64], +// [139, 154], +//]; +``` + +### Divide by scalar + +You can divide Matrix by scalar value. + +``` +$matrix->divideByScalar(2); +``` + +### Inverse + +Read more about [invertible matrix](https://en.wikipedia.org/wiki/Invertible_matrix) + +``` +$matrix = new Matrix([ + [3, 4, 2], + [4, 5, 5], + [1, 1, 1], +]); + +$matrix->inverse(); + +// result $inverseMatrix = [ +// [0, -1, 5], +// [1 / 2, 1 / 2, -7 / 2], +// [-1 / 2, 1 / 2, -1 / 2], +//]; + +``` + +### Cross out + +Cross out given row and column from Matrix. 
+ +``` +$matrix = new Matrix([ + [3, 4, 2], + [4, 5, 5], + [1, 1, 1], +]); + +$matrix->crossOut(1, 1) + +// result $crossOuted = [ +// [3, 2], +// [1, 1], +//]; +``` diff --git a/mkdocs.yml b/mkdocs.yml index 55d1914..eac1c17 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -3,20 +3,23 @@ pages: - Home: index.md - Machine Learning: - Classification: - - KNearestNeighbors: machine-learning/classification/knearestneighbors.md - - NaiveBayes: machine-learning/classification/naivebayes.md + - KNearestNeighbors: machine-learning/classification/k-nearest-neighbors.md + - NaiveBayes: machine-learning/classification/naive-bayes.md + - Regression: + - LeastSquares: machine-learning/regression/least-squares.md + - Clustering: + - KMeans: machine-learning/clustering/k-means.md + - DBSCAN: machine-learning/clustering/dbscan.md - Cross Validation: - - RandomSplit: machine-learning/cross-validation/randomsplit.md + - RandomSplit: machine-learning/cross-validation/random-split.md - Datasets: - Array Dataset: machine-learning/datasets/array-dataset.md - CSV Dataset: machine-learning/datasets/csv-dataset.md - - Demo: + - Ready to use datasets: - Iris: machine-learning/datasets/demo/iris.md - Metric: - Accuracy: machine-learning/metric/accuracy.md - - Distance: - - Euclidean: machine-learning/metric/distance/euclidean.md - - Chebyshev: machine-learning/metric/distance/chebyshev.md - - Manhattan: machine-learning/metric/distance/manhattan.md - - Minkowski: machine-learning/metric/distance/minkowski.md -theme: readthedocs \ No newline at end of file + - Math: + - Distance: math/distance.md + - Matrix: math/matrix.md +theme: readthedocs From abd3b38490f8b84297505abc8e35698ca26f81e8 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Mon, 2 May 2016 14:02:00 +0200 Subject: [PATCH 58/59] refactor kmeans variables names --- src/Phpml/Clustering/KMeans.php | 2 +- src/Phpml/Clustering/KMeans/Space.php | 68 ++++++--------------------- 2 files changed, 16 insertions(+), 54 deletions(-) diff 
--git a/src/Phpml/Clustering/KMeans.php b/src/Phpml/Clustering/KMeans.php index cdae3b5..c5372b0 100644 --- a/src/Phpml/Clustering/KMeans.php +++ b/src/Phpml/Clustering/KMeans.php @@ -51,7 +51,7 @@ class KMeans implements Clusterer } $clusters = []; - foreach ($space->solve($this->clustersNumber, $this->initialization) as $cluster) { + foreach ($space->cluster($this->clustersNumber, $this->initialization) as $cluster) { $clusters[] = $cluster->getPoints(); } diff --git a/src/Phpml/Clustering/KMeans/Space.php b/src/Phpml/Clustering/KMeans/Space.php index f4465cf..2976434 100644 --- a/src/Phpml/Clustering/KMeans/Space.php +++ b/src/Phpml/Clustering/KMeans/Space.php @@ -125,81 +125,51 @@ class Space extends SplObjectStorage } /** - * @param $nbClusters - * @param int $seed - * @param null $iterationCallback + * @param int $clustersNumber + * @param int $initMethod * * @return array|Cluster[] */ - public function solve($nbClusters, $seed = KMeans::INIT_RANDOM, $iterationCallback = null) + public function cluster(int $clustersNumber, int $initMethod = KMeans::INIT_RANDOM) { - if ($iterationCallback && !is_callable($iterationCallback)) { - throw new InvalidArgumentException('invalid iteration callback'); - } + $clusters = $this->initializeClusters($clustersNumber, $initMethod); - // initialize K clusters - $clusters = $this->initializeClusters($nbClusters, $seed); - - // there's only one cluster, clusterization has no meaning - if (count($clusters) == 1) { - return $clusters[0]; - } - - // until convergence is reached do { - $iterationCallback && $iterationCallback($this, $clusters); - } while ($this->iterate($clusters)); + } while (!$this->iterate($clusters)); - // clustering is done. 
return $clusters; } /** - * @param $nbClusters - * @param $seed + * @param $clustersNumber + * @param $initMethod * - * @return array + * @return array|Cluster[] */ - protected function initializeClusters($nbClusters, $seed) + protected function initializeClusters(int $clustersNumber, int $initMethod) { - if ($nbClusters <= 0) { - throw new InvalidArgumentException('invalid clusters number'); - } - - switch ($seed) { - // the default seeding method chooses completely random centroid + switch ($initMethod) { case KMeans::INIT_RANDOM: - // get the space boundaries to avoid placing clusters centroid too far from points list($min, $max) = $this->getBoundaries(); - - // initialize N clusters with a random point within space boundaries - for ($n = 0; $n < $nbClusters; ++$n) { + for ($n = 0; $n < $clustersNumber; ++$n) { $clusters[] = new Cluster($this, $this->getRandomPoint($min, $max)->getCoordinates()); } - break; - // the DASV seeding method consists of finding good initial centroids for the clusters case KMeans::INIT_KMEANS_PLUS_PLUS: - // find a random point $position = rand(1, count($this)); for ($i = 1, $this->rewind(); $i < $position && $this->valid(); $i++, $this->next()); $clusters[] = new Cluster($this, $this->current()->getCoordinates()); - // retains the distances between points and their closest clusters $distances = new SplObjectStorage(); - // create k clusters - for ($i = 1; $i < $nbClusters; ++$i) { + for ($i = 1; $i < $clustersNumber; ++$i) { $sum = 0; - - // for each points, get the distance with the closest centroid already choosen foreach ($this as $point) { $distance = $point->getDistanceWith($point->getClosest($clusters)); $sum += $distances[$point] = $distance; } - // choose a new random point using a weighted probability distribution $sum = rand(0, (int) $sum); foreach ($this as $point) { if (($sum -= $distances[$point]) > 0) { @@ -213,8 +183,6 @@ class Space extends SplObjectStorage break; } - - // assing all points to the first cluster 
$clusters[0]->attachAll($this); return $clusters; @@ -227,19 +195,15 @@ class Space extends SplObjectStorage */ protected function iterate($clusters) { - $continue = false; + $convergence = true; - // migration storages $attach = new SplObjectStorage(); $detach = new SplObjectStorage(); - // calculate proximity amongst points and clusters foreach ($clusters as $cluster) { foreach ($cluster as $point) { - // find the closest cluster $closest = $point->getClosest($clusters); - // move the point from its old cluster to its closest if ($closest !== $cluster) { isset($attach[$closest]) || $attach[$closest] = new SplObjectStorage(); isset($detach[$cluster]) || $detach[$cluster] = new SplObjectStorage(); @@ -247,12 +211,11 @@ class Space extends SplObjectStorage $attach[$closest]->attach($point); $detach[$cluster]->attach($point); - $continue = true; + $convergence = false; } } } - // perform points migrations foreach ($attach as $cluster) { $cluster->attachAll($attach[$cluster]); } @@ -261,11 +224,10 @@ class Space extends SplObjectStorage $cluster->detachAll($detach[$cluster]); } - // update all cluster's centroids foreach ($clusters as $cluster) { $cluster->updateCentroid(); } - return $continue; + return $convergence; } } From 3fd5abf566362539fb01ff6c584234d324315e36 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kondas Date: Mon, 2 May 2016 14:10:49 +0200 Subject: [PATCH 59/59] fix typo in code example --- README.md | 2 +- docs/index.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 71f7570..d3f65b7 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Fresh approach to Machine Learning in PHP. 
Note that at the moment PHP is not th Simple example of classification: ```php -use Phpml\Classifier\KNearestNeighbors; +use Phpml\Classification\KNearestNeighbors; $samples = [[1, 3], [1, 4], [2, 4], [3, 1], [4, 1], [4, 2]]; $labels = ['a', 'a', 'a', 'b', 'b', 'b']; diff --git a/docs/index.md b/docs/index.md index 71f7570..d3f65b7 100644 --- a/docs/index.md +++ b/docs/index.md @@ -10,7 +10,7 @@ Fresh approach to Machine Learning in PHP. Note that at the moment PHP is not th Simple example of classification: ```php -use Phpml\Classifier\KNearestNeighbors; +use Phpml\Classification\KNearestNeighbors; $samples = [[1, 3], [1, 4], [2, 4], [3, 1], [4, 1], [4, 2]]; $labels = ['a', 'a', 'a', 'b', 'b', 'b'];