2018-02-24 10:17:35 +00:00
|
|
|
<?php
|
|
|
|
|
|
|
|
declare(strict_types=1);
|
|
|
|
|
|
|
|
namespace Phpml\Dataset;
|
|
|
|
|
|
|
|
use Phpml\Exception\DatasetException;
|
|
|
|
use Phpml\Exception\FileException;
|
|
|
|
|
|
|
|
/**
 * Dataset read from a sparse, line-oriented text file.
 *
 * Every line is expected to look like:
 *
 *     <target> <index>:<value> <index>:<value> ... # optional comment
 *
 * Columns are separated by a single space or tab. Feature indices in the file
 * are 1-based and are stored 0-based in the resulting samples. Features that a
 * line does not mention are filled with 0, and all samples are padded to the
 * width of the largest index seen anywhere in the file.
 */
class SvmDataset extends ArrayDataset
{
    /**
     * Reads the file at $filePath and hands the parsed samples/targets to the parent dataset.
     *
     * @throws FileException    when the file does not exist or cannot be opened
     * @throws DatasetException when a line contains an invalid target, index or value
     */
    public function __construct(string $filePath)
    {
        [$samples, $targets] = self::readProblem($filePath);

        parent::__construct($samples, $targets);
    }

    /**
     * Parses the whole file into dense samples and their targets.
     *
     * @return array two-element list: [samples, targets]; samples are lists of
     *               numbers of equal length, targets are floats
     *
     * @throws FileException
     * @throws DatasetException
     */
    private static function readProblem(string $filePath): array
    {
        $handle = self::openFile($filePath);

        $samples = [];
        $targets = [];
        $maxIndex = 0;

        try {
            while (false !== $line = fgets($handle)) {
                [$sample, $target, $maxIndex] = self::processLine((string) $line, $maxIndex);
                $samples[] = $sample;
                $targets[] = $target;
            }
        } finally {
            // Close the handle even when a malformed line aborts parsing with an exception.
            fclose($handle);
        }

        // Earlier lines were sized for the largest index known at the time they
        // were read; bring every sample up to the final width. A key-indexed loop
        // is used so no by-reference loop variable is left behind.
        $width = $maxIndex + 1;
        foreach ($samples as $key => $sample) {
            $samples[$key] = array_pad($sample, $width, 0);
        }

        return [$samples, $targets];
    }

    /**
     * Opens the file for binary reading.
     *
     * @return resource
     *
     * @throws FileException when the file does not exist or fopen() fails
     */
    private static function openFile(string $filePath)
    {
        if (!file_exists($filePath)) {
            throw new FileException(sprintf('File "%s" missing.', basename($filePath)));
        }

        $handle = fopen($filePath, 'rb');
        if ($handle === false) {
            throw new FileException(sprintf('File "%s" can\'t be open.', basename($filePath)));
        }

        return $handle;
    }

    /**
     * Converts one raw line into a dense sample and its target.
     *
     * @param int $maxIndex largest 0-based feature index seen so far
     *
     * @return array three-element list: [sample, target, updated max index]
     *
     * @throws DatasetException
     */
    private static function processLine(string $line, int $maxIndex): array
    {
        $columns = self::parseLine($line);

        // The first column is the target; every following column is "index:value".
        $target = self::parseTargetColumn($columns[0]);
        $sample = array_fill(0, $maxIndex + 1, 0);

        $n = count($columns);
        for ($i = 1; $i < $n; ++$i) {
            [$index, $value] = self::parseFeatureColumn($columns[$i]);

            if ($index > $maxIndex) {
                // A new widest feature: grow this sample so the assignment below
                // keeps the array dense.
                $maxIndex = $index;
                $sample = array_pad($sample, $maxIndex + 1, 0);
            }

            $sample[$index] = $value;
        }

        return [$sample, $target, $maxIndex];
    }

    /**
     * Strips the trailing "#" comment and trailing whitespace, then splits the line into columns.
     *
     * Splitting is done on single spaces (tabs are converted to spaces first), so
     * a blank line, leading whitespace or repeated separators yield empty columns;
     * those are rejected later by the target/feature validators.
     *
     * @return string[]
     */
    private static function parseLine(string $line): array
    {
        $line = explode('#', $line, 2)[0];
        $line = rtrim($line);
        $line = str_replace("\t", ' ', $line);

        return explode(' ', $line);
    }

    /**
     * @throws DatasetException when the column is not numeric
     */
    private static function parseTargetColumn(string $column): float
    {
        if (!is_numeric($column)) {
            throw new DatasetException(sprintf('Invalid target "%s".', $column));
        }

        return (float) $column;
    }

    /**
     * Splits an "index:value" column and validates both halves.
     *
     * @return array two-element list: [0-based index, value]
     *
     * @throws DatasetException when the column has no ":" or either half is invalid
     */
    private static function parseFeatureColumn(string $column): array
    {
        $feature = explode(':', $column, 2);
        if (count($feature) !== 2) {
            throw new DatasetException(sprintf('Invalid value "%s".', $column));
        }

        $index = self::parseFeatureIndex($feature[0]);
        $value = self::parseFeatureValue($feature[1]);

        return [$index, $value];
    }

    /**
     * Validates a 1-based feature index and returns it 0-based.
     *
     * @throws DatasetException when the index is not a digit-only string or is less than 1
     */
    private static function parseFeatureIndex(string $index): int
    {
        // ctype_digit() only accepts non-empty strings of decimal digits, which
        // already rules out everything is_numeric() would reject.
        if (!ctype_digit($index) || (int) $index < 1) {
            throw new DatasetException(sprintf('Invalid index "%s".', $index));
        }

        return (int) $index - 1;
    }

    /**
     * @throws DatasetException when the value is not numeric
     */
    private static function parseFeatureValue(string $value): float
    {
        if (!is_numeric($value)) {
            throw new DatasetException(sprintf('Invalid value "%s".', $value));
        }

        return (float) $value;
    }
}
|