first commit

This commit is contained in:
CHIEFSOFT\ameye
2024-09-30 18:11:26 -04:00
commit e592ca6823
27270 changed files with 5002257 additions and 0 deletions
+21
View File
@@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2016-2020 Arkadiusz Kondas <arkadiusz.kondas[at]gmail>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
+10
View File
@@ -0,0 +1,10 @@
Current version is 0.10.0
# Download latest stable version from https://gitlab.com/php-ai/php-ml
# Remove all files but:
* src/
* LICENSE
# Copy content of src/ to /path/to/moodle/lib/mlbackend/php/phpml/src/Phpml
# Copy LICENSE file to /path/to/moodle/lib/mlbackend/php/phpml
# Applied patch https://github.com/jorgecasas/php-ml/pull/5
@@ -0,0 +1,332 @@
<?php
declare(strict_types=1);
namespace Phpml\Association;
use Phpml\Helper\Predictable;
use Phpml\Helper\Trainable;
class Apriori implements Associator
{
use Trainable;
use Predictable;
public const ARRAY_KEY_ANTECEDENT = 'antecedent';
public const ARRAY_KEY_CONFIDENCE = 'confidence';
public const ARRAY_KEY_CONSEQUENT = 'consequent';
public const ARRAY_KEY_SUPPORT = 'support';
/**
* Minimum relative probability of frequent transactions.
*
* @var float
*/
private $confidence;
/**
* The large set contains frequent k-length item sets.
*
* @var mixed[][][]
*/
private $large = [];
/**
* Minimum relative frequency of transactions.
*
* @var float
*/
private $support;
/**
* The generated Apriori association rules.
*
* @var mixed[][]
*/
private $rules = [];
/**
* Apriori constructor.
*/
public function __construct(float $support = 0.0, float $confidence = 0.0)
{
$this->support = $support;
$this->confidence = $confidence;
}
/**
* Get all association rules which are generated for every k-length frequent item set.
*
* @return mixed[][]
*/
public function getRules(): array
{
if (count($this->large) === 0) {
$this->large = $this->apriori();
}
if (count($this->rules) > 0) {
return $this->rules;
}
$this->rules = [];
$this->generateAllRules();
return $this->rules;
}
/**
* Generates frequent item sets.
*
* @return mixed[][][]
*/
public function apriori(): array
{
$L = [];
$items = $this->frequent($this->items());
for ($k = 1; isset($items[0]); ++$k) {
$L[$k] = $items;
$items = $this->frequent($this->candidates($items));
}
return $L;
}
/**
* @param mixed[] $sample
*
* @return mixed[][]
*/
protected function predictSample(array $sample): array
{
$predicts = array_values(array_filter($this->getRules(), function ($rule) use ($sample): bool {
return $this->equals($rule[self::ARRAY_KEY_ANTECEDENT], $sample);
}));
return array_map(static function ($rule) {
return $rule[self::ARRAY_KEY_CONSEQUENT];
}, $predicts);
}
/**
* Generate rules for each k-length frequent item set.
*/
private function generateAllRules(): void
{
for ($k = 2; isset($this->large[$k]); ++$k) {
foreach ($this->large[$k] as $frequent) {
$this->generateRules($frequent);
}
}
}
/**
* Generate confident rules for frequent item set.
*
* @param mixed[] $frequent
*/
private function generateRules(array $frequent): void
{
foreach ($this->antecedents($frequent) as $antecedent) {
$confidence = $this->confidence($frequent, $antecedent);
if ($this->confidence <= $confidence) {
$consequent = array_values(array_diff($frequent, $antecedent));
$this->rules[] = [
self::ARRAY_KEY_ANTECEDENT => $antecedent,
self::ARRAY_KEY_CONSEQUENT => $consequent,
self::ARRAY_KEY_SUPPORT => $this->support($frequent),
self::ARRAY_KEY_CONFIDENCE => $confidence,
];
}
}
}
/**
* Generates the power set for given item set $sample.
*
* @param mixed[] $sample
*
* @return mixed[][]
*/
private function powerSet(array $sample): array
{
$results = [[]];
foreach ($sample as $item) {
foreach ($results as $combination) {
$results[] = array_merge([$item], $combination);
}
}
return $results;
}
/**
* Generates all proper subsets for given set $sample without the empty set.
*
* @param mixed[] $sample
*
* @return mixed[][]
*/
private function antecedents(array $sample): array
{
$cardinality = count($sample);
$antecedents = $this->powerSet($sample);
return array_filter($antecedents, static function ($antecedent) use ($cardinality): bool {
return (count($antecedent) != $cardinality) && ($antecedent != []);
});
}
/**
* Calculates frequent k = 1 item sets.
*
* @return mixed[][]
*/
private function items(): array
{
$items = [];
foreach ($this->samples as $sample) {
foreach ($sample as $item) {
if (!in_array($item, $items, true)) {
$items[] = $item;
}
}
}
return array_map(static function ($entry): array {
return [$entry];
}, $items);
}
/**
* Returns frequent item sets only.
*
* @param mixed[][] $samples
*
* @return mixed[][]
*/
private function frequent(array $samples): array
{
return array_values(array_filter($samples, function ($entry): bool {
return $this->support($entry) >= $this->support;
}));
}
/**
* Calculates frequent k item sets, where count($samples) == $k - 1.
*
* @param mixed[][] $samples
*
* @return mixed[][]
*/
private function candidates(array $samples): array
{
$candidates = [];
foreach ($samples as $p) {
foreach ($samples as $q) {
if (count(array_merge(array_diff($p, $q), array_diff($q, $p))) != 2) {
continue;
}
$candidate = array_values(array_unique(array_merge($p, $q)));
if ($this->contains($candidates, $candidate)) {
continue;
}
foreach ($this->samples as $sample) {
if ($this->subset($sample, $candidate)) {
$candidates[] = $candidate;
continue 2;
}
}
}
}
return $candidates;
}
/**
* Calculates confidence for $set. Confidence is the relative amount of sets containing $subset which also contain
* $set.
*
* @param mixed[] $set
* @param mixed[] $subset
*/
private function confidence(array $set, array $subset): float
{
return $this->support($set) / $this->support($subset);
}
/**
* Calculates support for item set $sample. Support is the relative amount of sets containing $sample in the data
* pool.
*
* @see \Phpml\Association\Apriori::samples
*
* @param mixed[] $sample
*/
private function support(array $sample): float
{
return $this->frequency($sample) / count($this->samples);
}
/**
* Counts occurrences of $sample as subset in data pool.
*
* @see \Phpml\Association\Apriori::samples
*
* @param mixed[] $sample
*/
private function frequency(array $sample): int
{
return count(array_filter($this->samples, function ($entry) use ($sample): bool {
return $this->subset($entry, $sample);
}));
}
/**
* Returns true if set is an element of system.
*
* @see \Phpml\Association\Apriori::equals()
*
* @param mixed[][] $system
* @param mixed[] $set
*/
private function contains(array $system, array $set): bool
{
return (bool) array_filter($system, function ($entry) use ($set): bool {
return $this->equals($entry, $set);
});
}
/**
* Returns true if subset is a (proper) subset of set by its items string representation.
*
* @param mixed[] $set
* @param mixed[] $subset
*/
private function subset(array $set, array $subset): bool
{
return count(array_diff($subset, array_intersect($subset, $set))) === 0;
}
/**
* Returns true if string representation of items does not differ.
*
* @param mixed[] $set1
* @param mixed[] $set2
*/
private function equals(array $set1, array $set2): bool
{
return array_diff($set1, $set2) == array_diff($set2, $set1);
}
}
@@ -0,0 +1,11 @@
<?php
declare(strict_types=1);
namespace Phpml\Association;
use Phpml\Estimator;
interface Associator extends Estimator
{
}
@@ -0,0 +1,11 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification;
use Phpml\Estimator;
interface Classifier extends Estimator
{
}
@@ -0,0 +1,484 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification;
use Phpml\Classification\DecisionTree\DecisionTreeLeaf;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Helper\Predictable;
use Phpml\Helper\Trainable;
use Phpml\Math\Statistic\Mean;
class DecisionTree implements Classifier
{
use Trainable;
use Predictable;
public const CONTINUOUS = 1;
public const NOMINAL = 2;
/**
* @var int
*/
public $actualDepth = 0;
/**
* @var array
*/
protected $columnTypes = [];
/**
* @var DecisionTreeLeaf
*/
protected $tree;
/**
* @var int
*/
protected $maxDepth;
/**
* @var array
*/
private $labels = [];
/**
* @var int
*/
private $featureCount = 0;
/**
* @var int
*/
private $numUsableFeatures = 0;
/**
* @var array
*/
private $selectedFeatures = [];
/**
* @var array|null
*/
private $featureImportances;
/**
* @var array
*/
private $columnNames = [];
public function __construct(int $maxDepth = 10)
{
$this->maxDepth = $maxDepth;
}
public function train(array $samples, array $targets): void
{
$this->samples = array_merge($this->samples, $samples);
$this->targets = array_merge($this->targets, $targets);
$this->featureCount = count($this->samples[0]);
$this->columnTypes = self::getColumnTypes($this->samples);
$this->labels = array_keys(array_count_values($this->targets));
$this->tree = $this->getSplitLeaf(range(0, count($this->samples) - 1));
// Each time the tree is trained, feature importances are reset so that
// we will have to compute it again depending on the new data
$this->featureImportances = null;
// If column names are given or computed before, then there is no
// need to init it and accidentally remove the previous given names
if ($this->columnNames === []) {
$this->columnNames = range(0, $this->featureCount - 1);
} elseif (count($this->columnNames) > $this->featureCount) {
$this->columnNames = array_slice($this->columnNames, 0, $this->featureCount);
} elseif (count($this->columnNames) < $this->featureCount) {
$this->columnNames = array_merge(
$this->columnNames,
range(count($this->columnNames), $this->featureCount - 1)
);
}
}
public static function getColumnTypes(array $samples): array
{
$types = [];
$featureCount = count($samples[0]);
for ($i = 0; $i < $featureCount; ++$i) {
$values = array_column($samples, $i);
$isCategorical = self::isCategoricalColumn($values);
$types[] = $isCategorical ? self::NOMINAL : self::CONTINUOUS;
}
return $types;
}
/**
* @param mixed $baseValue
*/
public function getGiniIndex($baseValue, array $colValues, array $targets): float
{
$countMatrix = [];
foreach ($this->labels as $label) {
$countMatrix[$label] = [0, 0];
}
foreach ($colValues as $index => $value) {
$label = $targets[$index];
$rowIndex = $value === $baseValue ? 0 : 1;
++$countMatrix[$label][$rowIndex];
}
$giniParts = [0, 0];
for ($i = 0; $i <= 1; ++$i) {
$part = 0;
$sum = array_sum(array_column($countMatrix, $i));
if ($sum > 0) {
foreach ($this->labels as $label) {
$part += ($countMatrix[$label][$i] / (float) $sum) ** 2;
}
}
$giniParts[$i] = (1 - $part) * $sum;
}
return array_sum($giniParts) / count($colValues);
}
/**
* This method is used to set number of columns to be used
* when deciding a split at an internal node of the tree. <br>
* If the value is given 0, then all features are used (default behaviour),
* otherwise the given value will be used as a maximum for number of columns
* randomly selected for each split operation.
*
* @return $this
*
* @throws InvalidArgumentException
*/
public function setNumFeatures(int $numFeatures)
{
if ($numFeatures < 0) {
throw new InvalidArgumentException('Selected column count should be greater or equal to zero');
}
$this->numUsableFeatures = $numFeatures;
return $this;
}
/**
* A string array to represent columns. Useful when HTML output or
* column importances are desired to be inspected.
*
* @return $this
*
* @throws InvalidArgumentException
*/
public function setColumnNames(array $names)
{
if ($this->featureCount !== 0 && count($names) !== $this->featureCount) {
throw new InvalidArgumentException(sprintf('Length of the given array should be equal to feature count %s', $this->featureCount));
}
$this->columnNames = $names;
return $this;
}
public function getHtml(): string
{
return $this->tree->getHTML($this->columnNames);
}
/**
* This will return an array including an importance value for
* each column in the given dataset. The importance values are
* normalized and their total makes 1.<br/>
*/
public function getFeatureImportances(): array
{
if ($this->featureImportances !== null) {
return $this->featureImportances;
}
$sampleCount = count($this->samples);
$this->featureImportances = [];
foreach ($this->columnNames as $column => $columnName) {
$nodes = $this->getSplitNodesByColumn($column, $this->tree);
$importance = 0;
foreach ($nodes as $node) {
$importance += $node->getNodeImpurityDecrease($sampleCount);
}
$this->featureImportances[$columnName] = $importance;
}
// Normalize & sort the importances
$total = array_sum($this->featureImportances);
if ($total > 0) {
array_walk($this->featureImportances, function (&$importance) use ($total): void {
$importance /= $total;
});
arsort($this->featureImportances);
}
return $this->featureImportances;
}
protected function getSplitLeaf(array $records, int $depth = 0): DecisionTreeLeaf
{
$split = $this->getBestSplit($records);
$split->level = $depth;
if ($this->actualDepth < $depth) {
$this->actualDepth = $depth;
}
// Traverse all records to see if all records belong to the same class,
// otherwise group the records so that we can classify the leaf
// in case maximum depth is reached
$leftRecords = [];
$rightRecords = [];
$remainingTargets = [];
$prevRecord = null;
$allSame = true;
foreach ($records as $recordNo) {
// Check if the previous record is the same with the current one
$record = $this->samples[$recordNo];
if ($prevRecord !== null && $prevRecord != $record) {
$allSame = false;
}
$prevRecord = $record;
// According to the split criteron, this record will
// belong to either left or the right side in the next split
if ($split->evaluate($record)) {
$leftRecords[] = $recordNo;
} else {
$rightRecords[] = $recordNo;
}
// Group remaining targets
$target = $this->targets[$recordNo];
if (!array_key_exists($target, $remainingTargets)) {
$remainingTargets[$target] = 1;
} else {
++$remainingTargets[$target];
}
}
if ($allSame || $depth >= $this->maxDepth || count($remainingTargets) === 1) {
$split->isTerminal = true;
arsort($remainingTargets);
$split->classValue = (string) key($remainingTargets);
} else {
if (isset($leftRecords[0])) {
$split->leftLeaf = $this->getSplitLeaf($leftRecords, $depth + 1);
}
if (isset($rightRecords[0])) {
$split->rightLeaf = $this->getSplitLeaf($rightRecords, $depth + 1);
}
}
return $split;
}
protected function getBestSplit(array $records): DecisionTreeLeaf
{
$targets = array_intersect_key($this->targets, array_flip($records));
$samples = (array) array_combine(
$records,
$this->preprocess(array_intersect_key($this->samples, array_flip($records)))
);
$bestGiniVal = 1;
$bestSplit = null;
$features = $this->getSelectedFeatures();
foreach ($features as $i) {
$colValues = [];
foreach ($samples as $index => $row) {
$colValues[$index] = $row[$i];
}
$counts = array_count_values($colValues);
arsort($counts);
$baseValue = key($counts);
if ($baseValue === null) {
continue;
}
$gini = $this->getGiniIndex($baseValue, $colValues, $targets);
if ($bestSplit === null || $bestGiniVal > $gini) {
$split = new DecisionTreeLeaf();
$split->value = $baseValue;
$split->giniIndex = $gini;
$split->columnIndex = $i;
$split->isContinuous = $this->columnTypes[$i] === self::CONTINUOUS;
$split->records = $records;
// If a numeric column is to be selected, then
// the original numeric value and the selected operator
// will also be saved into the leaf for future access
if ($this->columnTypes[$i] === self::CONTINUOUS) {
$matches = [];
preg_match("/^([<>=]{1,2})\s*(.*)/", (string) $split->value, $matches);
$split->operator = $matches[1];
$split->numericValue = (float) $matches[2];
}
$bestSplit = $split;
$bestGiniVal = $gini;
}
}
return $bestSplit;
}
/**
* Returns available features/columns to the tree for the decision making
* process. <br>
*
* If a number is given with setNumFeatures() method, then a random selection
* of features up to this number is returned. <br>
*
* If some features are manually selected by use of setSelectedFeatures(),
* then only these features are returned <br>
*
* If any of above methods were not called beforehand, then all features
* are returned by default.
*/
protected function getSelectedFeatures(): array
{
$allFeatures = range(0, $this->featureCount - 1);
if ($this->numUsableFeatures === 0 && count($this->selectedFeatures) === 0) {
return $allFeatures;
}
if (count($this->selectedFeatures) > 0) {
return $this->selectedFeatures;
}
$numFeatures = $this->numUsableFeatures;
if ($numFeatures > $this->featureCount) {
$numFeatures = $this->featureCount;
}
shuffle($allFeatures);
$selectedFeatures = array_slice($allFeatures, 0, $numFeatures);
sort($selectedFeatures);
return $selectedFeatures;
}
protected function preprocess(array $samples): array
{
// Detect and convert continuous data column values into
// discrete values by using the median as a threshold value
$columns = [];
for ($i = 0; $i < $this->featureCount; ++$i) {
$values = array_column($samples, $i);
if ($this->columnTypes[$i] == self::CONTINUOUS) {
$median = Mean::median($values);
foreach ($values as &$value) {
if ($value <= $median) {
$value = "<= {$median}";
} else {
$value = "> {$median}";
}
}
}
$columns[] = $values;
}
// Below method is a strange yet very simple & efficient method
// to get the transpose of a 2D array
return array_map(null, ...$columns);
}
protected static function isCategoricalColumn(array $columnValues): bool
{
$count = count($columnValues);
// There are two main indicators that *may* show whether a
// column is composed of discrete set of values:
// 1- Column may contain string values and non-float values
// 2- Number of unique values in the column is only a small fraction of
// all values in that column (Lower than or equal to %20 of all values)
$numericValues = array_filter($columnValues, 'is_numeric');
$floatValues = array_filter($columnValues, 'is_float');
if (count($floatValues) > 0) {
return false;
}
if (count($numericValues) !== $count) {
return true;
}
$distinctValues = array_count_values($columnValues);
return count($distinctValues) <= $count / 5;
}
/**
* Used to set predefined features to consider while deciding which column to use for a split
*/
protected function setSelectedFeatures(array $selectedFeatures): void
{
$this->selectedFeatures = $selectedFeatures;
}
/**
* Collects and returns an array of internal nodes that use the given
* column as a split criterion
*/
protected function getSplitNodesByColumn(int $column, DecisionTreeLeaf $node): array
{
if ($node->isTerminal) {
return [];
}
$nodes = [];
if ($node->columnIndex === $column) {
$nodes[] = $node;
}
$lNodes = [];
$rNodes = [];
if ($node->leftLeaf !== null) {
$lNodes = $this->getSplitNodesByColumn($column, $node->leftLeaf);
}
if ($node->rightLeaf !== null) {
$rNodes = $this->getSplitNodesByColumn($column, $node->rightLeaf);
}
return array_merge($nodes, $lNodes, $rNodes);
}
/**
* @return mixed
*/
protected function predictSample(array $sample)
{
$node = $this->tree;
do {
if ($node->isTerminal) {
return $node->classValue;
}
if ($node->evaluate($sample)) {
$node = $node->leftLeaf;
} else {
$node = $node->rightLeaf;
}
} while ($node);
return $this->labels[0];
}
}
@@ -0,0 +1,165 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification\DecisionTree;
use Phpml\Math\Comparison;
class DecisionTreeLeaf
{
/**
* @var string|int
*/
public $value;
/**
* @var float
*/
public $numericValue;
/**
* @var string
*/
public $operator;
/**
* @var int
*/
public $columnIndex;
/**
* @var DecisionTreeLeaf|null
*/
public $leftLeaf;
/**
* @var DecisionTreeLeaf|null
*/
public $rightLeaf;
/**
* @var array
*/
public $records = [];
/**
* Class value represented by the leaf, this value is non-empty
* only for terminal leaves
*
* @var string
*/
public $classValue = '';
/**
* @var bool
*/
public $isTerminal = false;
/**
* @var bool
*/
public $isContinuous = false;
/**
* @var float
*/
public $giniIndex = 0;
/**
* @var int
*/
public $level = 0;
/**
* HTML representation of the tree without column names
*/
public function __toString(): string
{
return $this->getHTML();
}
public function evaluate(array $record): bool
{
$recordField = $record[$this->columnIndex];
if ($this->isContinuous) {
return Comparison::compare((string) $recordField, $this->numericValue, $this->operator);
}
return $recordField == $this->value;
}
/**
* Returns Mean Decrease Impurity (MDI) in the node.
* For terminal nodes, this value is equal to 0
*/
public function getNodeImpurityDecrease(int $parentRecordCount): float
{
if ($this->isTerminal) {
return 0.0;
}
$nodeSampleCount = (float) count($this->records);
$iT = $this->giniIndex;
if ($this->leftLeaf !== null) {
$pL = count($this->leftLeaf->records) / $nodeSampleCount;
$iT -= $pL * $this->leftLeaf->giniIndex;
}
if ($this->rightLeaf !== null) {
$pR = count($this->rightLeaf->records) / $nodeSampleCount;
$iT -= $pR * $this->rightLeaf->giniIndex;
}
return $iT * $nodeSampleCount / $parentRecordCount;
}
/**
* Returns HTML representation of the node including children nodes
*/
public function getHTML(?array $columnNames = null): string
{
if ($this->isTerminal) {
$value = "<b>{$this}->classValue</b>";
} else {
$value = $this->value;
if ($columnNames !== null) {
$col = $columnNames[$this->columnIndex];
} else {
$col = "col_$this->columnIndex";
}
if ((bool) preg_match('/^[<>=]{1,2}/', (string) $value) === false) {
$value = "={$value}";
}
$value = "<b>{$col} {$value}</b><br>Gini: ".number_format($this->giniIndex, 2);
}
$str = "<table ><tr><td colspan=3 align=center style='border:1px solid;'>{$value}</td></tr>";
if ($this->leftLeaf !== null || $this->rightLeaf !== null) {
$str .= '<tr>';
if ($this->leftLeaf !== null) {
$str .= '<td valign=top><b>| Yes</b><br>'.$this->leftLeaf->getHTML($columnNames).'</td>';
} else {
$str .= '<td></td>';
}
$str .= '<td>&nbsp;</td>';
if ($this->rightLeaf !== null) {
$str .= '<td valign=top align=right><b>No |</b><br>'.$this->rightLeaf->getHTML($columnNames).'</td>';
} else {
$str .= '<td></td>';
}
$str .= '</tr>';
}
$str .= '</table>';
return $str;
}
}
@@ -0,0 +1,252 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification\Ensemble;
use Phpml\Classification\Classifier;
use Phpml\Classification\Linear\DecisionStump;
use Phpml\Classification\WeightedClassifier;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Helper\Predictable;
use Phpml\Helper\Trainable;
use Phpml\Math\Statistic\Mean;
use Phpml\Math\Statistic\StandardDeviation;
use ReflectionClass;
class AdaBoost implements Classifier
{
use Predictable;
use Trainable;
/**
* Actual labels given in the targets array
*
* @var array
*/
protected $labels = [];
/**
* @var int
*/
protected $sampleCount;
/**
* @var int
*/
protected $featureCount;
/**
* Number of maximum iterations to be done
*
* @var int
*/
protected $maxIterations;
/**
* Sample weights
*
* @var array
*/
protected $weights = [];
/**
* List of selected 'weak' classifiers
*
* @var array
*/
protected $classifiers = [];
/**
* Base classifier weights
*
* @var array
*/
protected $alpha = [];
/**
* @var string
*/
protected $baseClassifier = DecisionStump::class;
/**
* @var array
*/
protected $classifierOptions = [];
/**
* ADAptive BOOSTing (AdaBoost) is an ensemble algorithm to
* improve classification performance of 'weak' classifiers such as
* DecisionStump (default base classifier of AdaBoost).
*/
public function __construct(int $maxIterations = 50)
{
$this->maxIterations = $maxIterations;
}
/**
* Sets the base classifier that will be used for boosting (default = DecisionStump)
*/
public function setBaseClassifier(string $baseClassifier = DecisionStump::class, array $classifierOptions = []): void
{
$this->baseClassifier = $baseClassifier;
$this->classifierOptions = $classifierOptions;
}
/**
* @throws InvalidArgumentException
*/
public function train(array $samples, array $targets): void
{
// Initialize usual variables
$this->labels = array_keys(array_count_values($targets));
if (count($this->labels) !== 2) {
throw new InvalidArgumentException('AdaBoost is a binary classifier and can classify between two classes only');
}
// Set all target values to either -1 or 1
$this->labels = [
1 => $this->labels[0],
-1 => $this->labels[1],
];
foreach ($targets as $target) {
$this->targets[] = $target == $this->labels[1] ? 1 : -1;
}
$this->samples = array_merge($this->samples, $samples);
$this->featureCount = count($samples[0]);
$this->sampleCount = count($this->samples);
// Initialize AdaBoost parameters
$this->weights = array_fill(0, $this->sampleCount, 1.0 / $this->sampleCount);
$this->classifiers = [];
$this->alpha = [];
// Execute the algorithm for a maximum number of iterations
$currIter = 0;
while ($this->maxIterations > $currIter++) {
// Determine the best 'weak' classifier based on current weights
$classifier = $this->getBestClassifier();
$errorRate = $this->evaluateClassifier($classifier);
// Update alpha & weight values at each iteration
$alpha = $this->calculateAlpha($errorRate);
$this->updateWeights($classifier, $alpha);
$this->classifiers[] = $classifier;
$this->alpha[] = $alpha;
}
}
/**
* @return mixed
*/
public function predictSample(array $sample)
{
$sum = 0;
foreach ($this->alpha as $index => $alpha) {
$h = $this->classifiers[$index]->predict($sample);
$sum += $h * $alpha;
}
return $this->labels[$sum > 0 ? 1 : -1];
}
/**
* Returns the classifier with the lowest error rate with the
* consideration of current sample weights
*/
protected function getBestClassifier(): Classifier
{
$ref = new ReflectionClass($this->baseClassifier);
/** @var Classifier $classifier */
$classifier = count($this->classifierOptions) === 0 ? $ref->newInstance() : $ref->newInstanceArgs($this->classifierOptions);
if ($classifier instanceof WeightedClassifier) {
$classifier->setSampleWeights($this->weights);
$classifier->train($this->samples, $this->targets);
} else {
[$samples, $targets] = $this->resample();
$classifier->train($samples, $targets);
}
return $classifier;
}
/**
* Resamples the dataset in accordance with the weights and
* returns the new dataset
*/
protected function resample(): array
{
$weights = $this->weights;
$std = StandardDeviation::population($weights);
$mean = Mean::arithmetic($weights);
$min = min($weights);
$minZ = (int) round(($min - $mean) / $std);
$samples = [];
$targets = [];
foreach ($weights as $index => $weight) {
$z = (int) round(($weight - $mean) / $std) - $minZ + 1;
for ($i = 0; $i < $z; ++$i) {
if (random_int(0, 1) == 0) {
continue;
}
$samples[] = $this->samples[$index];
$targets[] = $this->targets[$index];
}
}
return [$samples, $targets];
}
/**
* Evaluates the classifier and returns the classification error rate
*/
protected function evaluateClassifier(Classifier $classifier): float
{
$total = (float) array_sum($this->weights);
$wrong = 0;
foreach ($this->samples as $index => $sample) {
$predicted = $classifier->predict($sample);
if ($predicted != $this->targets[$index]) {
$wrong += $this->weights[$index];
}
}
return $wrong / $total;
}
/**
* Calculates alpha of a classifier
*/
protected function calculateAlpha(float $errorRate): float
{
if ($errorRate == 0) {
$errorRate = 1e-10;
}
return 0.5 * log((1 - $errorRate) / $errorRate);
}
/**
* Updates the sample weights
*/
protected function updateWeights(Classifier $classifier, float $alpha): void
{
$sumOfWeights = array_sum($this->weights);
$weightsT1 = [];
foreach ($this->weights as $index => $weight) {
$desired = $this->targets[$index];
$output = $classifier->predict($this->samples[$index]);
$weight *= exp(-$alpha * $desired * $output) / $sumOfWeights;
$weightsT1[] = $weight;
}
$this->weights = $weightsT1;
}
}
@@ -0,0 +1,170 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification\Ensemble;
use Phpml\Classification\Classifier;
use Phpml\Classification\DecisionTree;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Helper\Predictable;
use Phpml\Helper\Trainable;
use ReflectionClass;
class Bagging implements Classifier
{
use Trainable;
use Predictable;
/**
* @var int
*/
protected $numSamples;
/**
* @var int
*/
protected $featureCount = 0;
/**
* @var int
*/
protected $numClassifier;
/**
* @var string
*/
protected $classifier = DecisionTree::class;
/**
* @var array
*/
protected $classifierOptions = ['maxDepth' => 20];
/**
* @var array
*/
protected $classifiers = [];
/**
* @var float
*/
protected $subsetRatio = 0.7;
/**
* Creates an ensemble classifier with given number of base classifiers
* Default number of base classifiers is 50.
* The more number of base classifiers, the better performance but at the cost of procesing time
*/
public function __construct(int $numClassifier = 50)
{
$this->numClassifier = $numClassifier;
}
/**
* This method determines the ratio of samples used to create the 'bootstrap' subset,
* e.g., random samples drawn from the original dataset with replacement (allow repeats),
* to train each base classifier.
*
* @return $this
*
* @throws InvalidArgumentException
*/
public function setSubsetRatio(float $ratio)
{
if ($ratio < 0.1 || $ratio > 1.0) {
throw new InvalidArgumentException('Subset ratio should be between 0.1 and 1.0');
}
$this->subsetRatio = $ratio;
return $this;
}
/**
* This method is used to set the base classifier. Default value is
* DecisionTree::class, but any class that implements the <i>Classifier</i>
* can be used. <br>
* While giving the parameters of the classifier, the values should be
* given in the order they are in the constructor of the classifier and parameter
* names are neglected.
*
* @return $this
*/
public function setClassifer(string $classifier, array $classifierOptions = [])
{
$this->classifier = $classifier;
$this->classifierOptions = $classifierOptions;
return $this;
}
public function train(array $samples, array $targets): void
{
$this->samples = array_merge($this->samples, $samples);
$this->targets = array_merge($this->targets, $targets);
$this->featureCount = count($samples[0]);
$this->numSamples = count($this->samples);
// Init classifiers and train them with bootstrap samples
$this->classifiers = $this->initClassifiers();
$index = 0;
foreach ($this->classifiers as $classifier) {
[$samples, $targets] = $this->getRandomSubset($index);
$classifier->train($samples, $targets);
++$index;
}
}
protected function getRandomSubset(int $index): array
{
$samples = [];
$targets = [];
srand($index);
$bootstrapSize = $this->subsetRatio * $this->numSamples;
for ($i = 0; $i < $bootstrapSize; ++$i) {
$rand = random_int(0, $this->numSamples - 1);
$samples[] = $this->samples[$rand];
$targets[] = $this->targets[$rand];
}
return [$samples, $targets];
}
protected function initClassifiers(): array
{
$classifiers = [];
for ($i = 0; $i < $this->numClassifier; ++$i) {
$ref = new ReflectionClass($this->classifier);
/** @var Classifier $obj */
$obj = count($this->classifierOptions) === 0 ? $ref->newInstance() : $ref->newInstanceArgs($this->classifierOptions);
$classifiers[] = $this->initSingleClassifier($obj);
}
return $classifiers;
}
protected function initSingleClassifier(Classifier $classifier): Classifier
{
return $classifier;
}
/**
* @return mixed
*/
protected function predictSample(array $sample)
{
$predictions = [];
foreach ($this->classifiers as $classifier) {
/** @var Classifier $classifier */
$predictions[] = $classifier->predict($sample);
}
$counts = array_count_values($predictions);
arsort($counts);
reset($counts);
return key($counts);
}
}
@@ -0,0 +1,157 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification\Ensemble;
use Phpml\Classification\Classifier;
use Phpml\Classification\DecisionTree;
use Phpml\Exception\InvalidArgumentException;
class RandomForest extends Bagging
{
/**
* @var float|string
*/
protected $featureSubsetRatio = 'log';
/**
* @var array|null
*/
protected $columnNames;
/**
* Initializes RandomForest with the given number of trees. More trees
* may increase the prediction performance while it will also substantially
* increase the processing time and the required memory
*/
public function __construct(int $numClassifier = 50)
{
parent::__construct($numClassifier);
$this->setSubsetRatio(1.0);
}
/**
* This method is used to determine how many of the original columns (features)
* will be used to construct subsets to train base classifiers.<br>
*
* Allowed values: 'sqrt', 'log' or any float number between 0.1 and 1.0 <br>
*
* Default value for the ratio is 'log' which results in log(numFeatures, 2) + 1
* features to be taken into consideration while selecting subspace of features
*
* @param mixed $ratio
*/
public function setFeatureSubsetRatio($ratio): self
{
if (!is_string($ratio) && !is_float($ratio)) {
throw new InvalidArgumentException('Feature subset ratio must be a string or a float');
}
if (is_float($ratio) && ($ratio < 0.1 || $ratio > 1.0)) {
throw new InvalidArgumentException('When a float is given, feature subset ratio should be between 0.1 and 1.0');
}
if (is_string($ratio) && $ratio !== 'sqrt' && $ratio !== 'log') {
throw new InvalidArgumentException("When a string is given, feature subset ratio can only be 'sqrt' or 'log'");
}
$this->featureSubsetRatio = $ratio;
return $this;
}
/**
* RandomForest algorithm is usable *only* with DecisionTree
*
* @return $this
*/
public function setClassifer(string $classifier, array $classifierOptions = [])
{
if ($classifier !== DecisionTree::class) {
throw new InvalidArgumentException('RandomForest can only use DecisionTree as base classifier');
}
parent::setClassifer($classifier, $classifierOptions);
return $this;
}
/**
* This will return an array including an importance value for
* each column in the given dataset. Importance values for a column
* is the average importance of that column in all trees in the forest
*/
public function getFeatureImportances(): array
{
// Traverse each tree and sum importance of the columns
$sum = [];
foreach ($this->classifiers as $tree) {
/** @var DecisionTree $tree */
$importances = $tree->getFeatureImportances();
foreach ($importances as $column => $importance) {
if (array_key_exists($column, $sum)) {
$sum[$column] += $importance;
} else {
$sum[$column] = $importance;
}
}
}
// Normalize & sort the importance values
$total = array_sum($sum);
array_walk($sum, function (&$importance) use ($total): void {
$importance /= $total;
});
arsort($sum);
return $sum;
}
/**
* A string array to represent the columns is given. They are useful
* when trying to print some information about the trees such as feature importances
*
* @return $this
*/
public function setColumnNames(array $names)
{
$this->columnNames = $names;
return $this;
}
/**
* @return DecisionTree
*/
protected function initSingleClassifier(Classifier $classifier): Classifier
{
if (!$classifier instanceof DecisionTree) {
throw new InvalidArgumentException(
sprintf('Classifier %s expected, got %s', DecisionTree::class, get_class($classifier))
);
}
if (is_float($this->featureSubsetRatio)) {
$featureCount = (int) ($this->featureSubsetRatio * $this->featureCount);
} elseif ($this->featureSubsetRatio === 'sqrt') {
$featureCount = (int) ($this->featureCount ** .5) + 1;
} else {
$featureCount = (int) log($this->featureCount, 2) + 1;
}
if ($featureCount >= $this->featureCount) {
$featureCount = $this->featureCount;
}
if ($this->columnNames === null) {
$this->columnNames = range(0, $this->featureCount - 1);
}
return $classifier
->setColumnNames($this->columnNames)
->setNumFeatures($featureCount);
}
}
@@ -0,0 +1,75 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification;
use Phpml\Helper\Predictable;
use Phpml\Helper\Trainable;
use Phpml\Math\Distance;
use Phpml\Math\Distance\Euclidean;
class KNearestNeighbors implements Classifier
{
use Trainable;
use Predictable;
/**
* @var int
*/
private $k;
/**
* @var Distance
*/
private $distanceMetric;
/**
* @param Distance|null $distanceMetric (if null then Euclidean distance as default)
*/
public function __construct(int $k = 3, ?Distance $distanceMetric = null)
{
if ($distanceMetric === null) {
$distanceMetric = new Euclidean();
}
$this->k = $k;
$this->samples = [];
$this->targets = [];
$this->distanceMetric = $distanceMetric;
}
/**
* @return mixed
*/
protected function predictSample(array $sample)
{
$distances = $this->kNeighborsDistances($sample);
$predictions = (array) array_combine(array_values($this->targets), array_fill(0, count($this->targets), 0));
foreach (array_keys($distances) as $index) {
++$predictions[$this->targets[$index]];
}
arsort($predictions);
reset($predictions);
return key($predictions);
}
/**
* @throws \Phpml\Exception\InvalidArgumentException
*/
private function kNeighborsDistances(array $sample): array
{
$distances = [];
foreach ($this->samples as $index => $neighbor) {
$distances[$index] = $this->distanceMetric->distance($sample, $neighbor);
}
asort($distances);
return array_slice($distances, 0, $this->k, true);
}
}
@@ -0,0 +1,75 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification\Linear;
use Phpml\Exception\InvalidArgumentException;
class Adaline extends Perceptron
{
/**
* Batch training is the default Adaline training algorithm
*/
public const BATCH_TRAINING = 1;
/**
* Online training: Stochastic gradient descent learning
*/
public const ONLINE_TRAINING = 2;
/**
* Training type may be either 'Batch' or 'Online' learning
*
* @var string|int
*/
protected $trainingType;
/**
* Initalize an Adaline (ADAptive LInear NEuron) classifier with given learning rate and maximum
* number of iterations used while training the classifier <br>
*
* Learning rate should be a float value between 0.0(exclusive) and 1.0 (inclusive) <br>
* Maximum number of iterations can be an integer value greater than 0 <br>
* If normalizeInputs is set to true, then every input given to the algorithm will be standardized
* by use of standard deviation and mean calculation
*
* @throws InvalidArgumentException
*/
public function __construct(
float $learningRate = 0.001,
int $maxIterations = 1000,
bool $normalizeInputs = true,
int $trainingType = self::BATCH_TRAINING
) {
if (!in_array($trainingType, [self::BATCH_TRAINING, self::ONLINE_TRAINING], true)) {
throw new InvalidArgumentException('Adaline can only be trained with batch and online/stochastic gradient descent algorithm');
}
$this->trainingType = $trainingType;
parent::__construct($learningRate, $maxIterations, $normalizeInputs);
}
/**
* Adapts the weights with respect to given samples and targets
* by use of gradient descent learning rule
*/
protected function runTraining(array $samples, array $targets): void
{
// The cost function is the sum of squares
$callback = function ($weights, $sample, $target): array {
$this->weights = $weights;
$output = $this->output($sample);
$gradient = $output - $target;
$error = $gradient ** 2;
return [$error, $gradient];
};
$isBatch = $this->trainingType == self::BATCH_TRAINING;
parent::runGradientDescent($samples, $targets, $callback, $isBatch);
}
}
@@ -0,0 +1,319 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification\Linear;
use Phpml\Classification\DecisionTree;
use Phpml\Classification\WeightedClassifier;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Helper\OneVsRest;
use Phpml\Helper\Predictable;
use Phpml\Math\Comparison;
class DecisionStump extends WeightedClassifier
{
use Predictable;
use OneVsRest;
public const AUTO_SELECT = -1;
/**
* @var int
*/
protected $givenColumnIndex;
/**
* @var array
*/
protected $binaryLabels = [];
/**
* Lowest error rate obtained while training/optimizing the model
*
* @var float
*/
protected $trainingErrorRate;
/**
* @var int
*/
protected $column;
/**
* @var mixed
*/
protected $value;
/**
* @var string
*/
protected $operator;
/**
* @var array
*/
protected $columnTypes = [];
/**
* @var int
*/
protected $featureCount;
/**
* @var float
*/
protected $numSplitCount = 100.0;
/**
* Distribution of samples in the leaves
*
* @var array
*/
protected $prob = [];
/**
* A DecisionStump classifier is a one-level deep DecisionTree. It is generally
* used with ensemble algorithms as in the weak classifier role. <br>
*
* If columnIndex is given, then the stump tries to produce a decision node
* on this column, otherwise in cases given the value of -1, the stump itself
* decides which column to take for the decision (Default DecisionTree behaviour)
*/
public function __construct(int $columnIndex = self::AUTO_SELECT)
{
$this->givenColumnIndex = $columnIndex;
}
public function __toString(): string
{
return "IF {$this->column} {$this->operator} {$this->value} ".
'THEN '.$this->binaryLabels[0].' '.
'ELSE '.$this->binaryLabels[1];
}
/**
* While finding best split point for a numerical valued column,
* DecisionStump looks for equally distanced values between minimum and maximum
* values in the column. Given <i>$count</i> value determines how many split
* points to be probed. The more split counts, the better performance but
* worse processing time (Default value is 10.0)
*/
public function setNumericalSplitCount(float $count): void
{
$this->numSplitCount = $count;
}
/**
* @throws InvalidArgumentException
*/
protected function trainBinary(array $samples, array $targets, array $labels): void
{
$this->binaryLabels = $labels;
$this->featureCount = count($samples[0]);
// If a column index is given, it should be among the existing columns
if ($this->givenColumnIndex > count($samples[0]) - 1) {
$this->givenColumnIndex = self::AUTO_SELECT;
}
// Check the size of the weights given.
// If none given, then assign 1 as a weight to each sample
if (count($this->weights) === 0) {
$this->weights = array_fill(0, count($samples), 1);
} else {
$numWeights = count($this->weights);
if ($numWeights !== count($samples)) {
throw new InvalidArgumentException('Number of sample weights does not match with number of samples');
}
}
// Determine type of each column as either "continuous" or "nominal"
$this->columnTypes = DecisionTree::getColumnTypes($samples);
// Try to find the best split in the columns of the dataset
// by calculating error rate for each split point in each column
$columns = range(0, count($samples[0]) - 1);
if ($this->givenColumnIndex !== self::AUTO_SELECT) {
$columns = [$this->givenColumnIndex];
}
$bestSplit = [
'value' => 0,
'operator' => '',
'prob' => [],
'column' => 0,
'trainingErrorRate' => 1.0,
];
foreach ($columns as $col) {
if ($this->columnTypes[$col] == DecisionTree::CONTINUOUS) {
$split = $this->getBestNumericalSplit($samples, $targets, $col);
} else {
$split = $this->getBestNominalSplit($samples, $targets, $col);
}
if ($split['trainingErrorRate'] < $bestSplit['trainingErrorRate']) {
$bestSplit = $split;
}
}
// Assign determined best values to the stump
foreach ($bestSplit as $name => $value) {
$this->{$name} = $value;
}
}
/**
* Determines best split point for the given column
*/
protected function getBestNumericalSplit(array $samples, array $targets, int $col): array
{
$values = array_column($samples, $col);
// Trying all possible points may be accomplished in two general ways:
// 1- Try all values in the $samples array ($values)
// 2- Artificially split the range of values into several parts and try them
// We choose the second one because it is faster in larger datasets
$minValue = min($values);
$maxValue = max($values);
$stepSize = ($maxValue - $minValue) / $this->numSplitCount;
$split = [];
foreach (['<=', '>'] as $operator) {
// Before trying all possible split points, let's first try
// the average value for the cut point
$threshold = array_sum($values) / (float) count($values);
[$errorRate, $prob] = $this->calculateErrorRate($targets, $threshold, $operator, $values);
if (!isset($split['trainingErrorRate']) || $errorRate < $split['trainingErrorRate']) {
$split = [
'value' => $threshold,
'operator' => $operator,
'prob' => $prob,
'column' => $col,
'trainingErrorRate' => $errorRate,
];
}
// Try other possible points one by one
for ($step = $minValue; $step <= $maxValue; $step += $stepSize) {
$threshold = (float) $step;
[$errorRate, $prob] = $this->calculateErrorRate($targets, $threshold, $operator, $values);
if ($errorRate < $split['trainingErrorRate']) {
$split = [
'value' => $threshold,
'operator' => $operator,
'prob' => $prob,
'column' => $col,
'trainingErrorRate' => $errorRate,
];
}
}// for
}
return $split;
}
protected function getBestNominalSplit(array $samples, array $targets, int $col): array
{
$values = array_column($samples, $col);
$valueCounts = array_count_values($values);
$distinctVals = array_keys($valueCounts);
$split = [];
foreach (['=', '!='] as $operator) {
foreach ($distinctVals as $val) {
[$errorRate, $prob] = $this->calculateErrorRate($targets, $val, $operator, $values);
if (!isset($split['trainingErrorRate']) || $split['trainingErrorRate'] < $errorRate) {
$split = [
'value' => $val,
'operator' => $operator,
'prob' => $prob,
'column' => $col,
'trainingErrorRate' => $errorRate,
];
}
}
}
return $split;
}
/**
* Calculates the ratio of wrong predictions based on the new threshold
* value given as the parameter
*/
protected function calculateErrorRate(array $targets, float $threshold, string $operator, array $values): array
{
$wrong = 0.0;
$prob = [];
$leftLabel = $this->binaryLabels[0];
$rightLabel = $this->binaryLabels[1];
foreach ($values as $index => $value) {
if (Comparison::compare($value, $threshold, $operator)) {
$predicted = $leftLabel;
} else {
$predicted = $rightLabel;
}
$target = $targets[$index];
if ((string) $predicted != (string) $targets[$index]) {
$wrong += $this->weights[$index];
}
if (!isset($prob[$predicted][$target])) {
$prob[$predicted][$target] = 0;
}
++$prob[$predicted][$target];
}
// Calculate probabilities: Proportion of labels in each leaf
$dist = array_combine($this->binaryLabels, array_fill(0, 2, 0.0));
foreach ($prob as $leaf => $counts) {
$leafTotal = (float) array_sum($prob[$leaf]);
foreach ($counts as $label => $count) {
if ((string) $leaf == (string) $label) {
$dist[$leaf] = $count / $leafTotal;
}
}
}
return [$wrong / (float) array_sum($this->weights), $dist];
}
/**
* Returns the probability of the sample of belonging to the given label
*
* Probability of a sample is calculated as the proportion of the label
* within the labels of the training samples in the decision node
*
* @param mixed $label
*/
protected function predictProbability(array $sample, $label): float
{
$predicted = $this->predictSampleBinary($sample);
if ((string) $predicted == (string) $label) {
return $this->prob[$label];
}
return 0.0;
}
/**
* @return mixed
*/
protected function predictSampleBinary(array $sample)
{
if (Comparison::compare($sample[$this->column], $this->value, $this->operator)) {
return $this->binaryLabels[0];
}
return $this->binaryLabels[1];
}
protected function resetBinary(): void
{
}
}
@@ -0,0 +1,283 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification\Linear;
use Closure;
use Exception;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Helper\Optimizer\ConjugateGradient;
class LogisticRegression extends Adaline
{
/**
* Batch training: Gradient descent algorithm (default)
*/
public const BATCH_TRAINING = 1;
/**
* Online training: Stochastic gradient descent learning
*/
public const ONLINE_TRAINING = 2;
/**
* Conjugate Batch: Conjugate Gradient algorithm
*/
public const CONJUGATE_GRAD_TRAINING = 3;
/**
* Cost function to optimize: 'log' and 'sse' are supported <br>
* - 'log' : log likelihood <br>
* - 'sse' : sum of squared errors <br>
*
* @var string
*/
protected $costFunction = 'log';
/**
* Regularization term: only 'L2' is supported
*
* @var string
*/
protected $penalty = 'L2';
/**
* Lambda (λ) parameter of regularization term. If λ is set to 0, then
* regularization term is cancelled.
*
* @var float
*/
protected $lambda = 0.5;
/**
* Initalize a Logistic Regression classifier with maximum number of iterations
* and learning rule to be applied <br>
*
* Maximum number of iterations can be an integer value greater than 0 <br>
* If normalizeInputs is set to true, then every input given to the algorithm will be standardized
* by use of standard deviation and mean calculation <br>
*
* Cost function can be 'log' for log-likelihood and 'sse' for sum of squared errors <br>
*
* Penalty (Regularization term) can be 'L2' or empty string to cancel penalty term
*
* @throws InvalidArgumentException
*/
public function __construct(
int $maxIterations = 500,
bool $normalizeInputs = true,
int $trainingType = self::CONJUGATE_GRAD_TRAINING,
string $cost = 'log',
string $penalty = 'L2'
) {
$trainingTypes = range(self::BATCH_TRAINING, self::CONJUGATE_GRAD_TRAINING);
if (!in_array($trainingType, $trainingTypes, true)) {
throw new InvalidArgumentException(
'Logistic regression can only be trained with '.
'batch (gradient descent), online (stochastic gradient descent) '.
'or conjugate batch (conjugate gradients) algorithms'
);
}
if (!in_array($cost, ['log', 'sse'], true)) {
throw new InvalidArgumentException(
"Logistic regression cost function can be one of the following: \n".
"'log' for log-likelihood and 'sse' for sum of squared errors"
);
}
if ($penalty !== '' && strtoupper($penalty) !== 'L2') {
throw new InvalidArgumentException('Logistic regression supports only \'L2\' regularization');
}
$this->learningRate = 0.001;
parent::__construct($this->learningRate, $maxIterations, $normalizeInputs);
$this->trainingType = $trainingType;
$this->costFunction = $cost;
$this->penalty = $penalty;
}
/**
* Sets the learning rate if gradient descent algorithm is
* selected for training
*/
public function setLearningRate(float $learningRate): void
{
$this->learningRate = $learningRate;
}
/**
* Lambda (λ) parameter of regularization term. If 0 is given,
* then the regularization term is cancelled
*/
public function setLambda(float $lambda): void
{
$this->lambda = $lambda;
}
/**
* Adapts the weights with respect to given samples and targets
* by use of selected solver
*
* @throws \Exception
*/
protected function runTraining(array $samples, array $targets): void
{
$callback = $this->getCostFunction();
switch ($this->trainingType) {
case self::BATCH_TRAINING:
$this->runGradientDescent($samples, $targets, $callback, true);
return;
case self::ONLINE_TRAINING:
$this->runGradientDescent($samples, $targets, $callback, false);
return;
case self::CONJUGATE_GRAD_TRAINING:
$this->runConjugateGradient($samples, $targets, $callback);
return;
default:
// Not reached
throw new Exception(sprintf('Logistic regression has invalid training type: %d.', $this->trainingType));
}
}
/**
* Executes Conjugate Gradient method to optimize the weights of the LogReg model
*/
protected function runConjugateGradient(array $samples, array $targets, Closure $gradientFunc): void
{
if ($this->optimizer === null) {
$this->optimizer = (new ConjugateGradient($this->featureCount))
->setMaxIterations($this->maxIterations);
}
$this->weights = $this->optimizer->runOptimization($samples, $targets, $gradientFunc);
$this->costValues = $this->optimizer->getCostValues();
}
/**
* Returns the appropriate callback function for the selected cost function
*
* @throws \Exception
*/
protected function getCostFunction(): Closure
{
$penalty = 0;
if ($this->penalty === 'L2') {
$penalty = $this->lambda;
}
switch ($this->costFunction) {
case 'log':
/*
* Negative of Log-likelihood cost function to be minimized:
* J(x) = ∑( - y . log(h(x)) - (1 - y) . log(1 - h(x)))
*
* If regularization term is given, then it will be added to the cost:
* for L2 : J(x) = J(x) + λ/m . w
*
* The gradient of the cost function to be used with gradient descent:
* ∇J(x) = -(y - h(x)) = (h(x) - y)
*/
return function ($weights, $sample, $y) use ($penalty): array {
$this->weights = $weights;
$hX = $this->output($sample);
// In cases where $hX = 1 or $hX = 0, the log-likelihood
// value will give a NaN, so we fix these values
if ($hX == 1) {
$hX = 1 - 1e-10;
}
if ($hX == 0) {
$hX = 1e-10;
}
$y = $y < 0 ? 0 : 1;
$error = -$y * log($hX) - (1 - $y) * log(1 - $hX);
$gradient = $hX - $y;
return [$error, $gradient, $penalty];
};
case 'sse':
/*
* Sum of squared errors or least squared errors cost function:
* J(x) = ∑ (y - h(x))^2
*
* If regularization term is given, then it will be added to the cost:
* for L2 : J(x) = J(x) + λ/m . w
*
* The gradient of the cost function:
* ∇J(x) = -(h(x) - y) . h(x) . (1 - h(x))
*/
return function ($weights, $sample, $y) use ($penalty): array {
$this->weights = $weights;
$hX = $this->output($sample);
$y = $y < 0 ? 0 : 1;
$error = (($y - $hX) ** 2);
$gradient = -($y - $hX) * $hX * (1 - $hX);
return [$error, $gradient, $penalty];
};
default:
// Not reached
throw new Exception(sprintf('Logistic regression has invalid cost function: %s.', $this->costFunction));
}
}
/**
* Returns the output of the network, a float value between 0.0 and 1.0
*/
protected function output(array $sample): float
{
$sum = parent::output($sample);
return 1.0 / (1.0 + exp(-$sum));
}
/**
* Returns the class value (either -1 or 1) for the given input
*/
protected function outputClass(array $sample): int
{
$output = $this->output($sample);
if ($output > 0.5) {
return 1;
}
return -1;
}
/**
* Returns the probability of the sample of belonging to the given label.
*
* The probability is simply taken as the distance of the sample
* to the decision plane.
*
* @param mixed $label
*/
protected function predictProbability(array $sample, $label): float
{
$sample = $this->checkNormalizedSample($sample);
$probability = $this->output($sample);
if (array_search($label, $this->labels, true) > 0) {
return $probability;
}
return 1 - $probability;
}
}
@@ -0,0 +1,264 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification\Linear;
use Closure;
use Phpml\Classification\Classifier;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Helper\OneVsRest;
use Phpml\Helper\Optimizer\GD;
use Phpml\Helper\Optimizer\Optimizer;
use Phpml\Helper\Optimizer\StochasticGD;
use Phpml\Helper\Predictable;
use Phpml\IncrementalEstimator;
use Phpml\Preprocessing\Normalizer;
class Perceptron implements Classifier, IncrementalEstimator
{
use Predictable;
use OneVsRest;
/**
* @var Optimizer|GD|StochasticGD|null
*/
protected $optimizer;
/**
* @var array
*/
protected $labels = [];
/**
* @var int
*/
protected $featureCount = 0;
/**
* @var array
*/
protected $weights = [];
/**
* @var float
*/
protected $learningRate;
/**
* @var int
*/
protected $maxIterations;
/**
* @var Normalizer
*/
protected $normalizer;
/**
* @var bool
*/
protected $enableEarlyStop = true;
/**
* Initalize a perceptron classifier with given learning rate and maximum
* number of iterations used while training the perceptron
*
* @param float $learningRate Value between 0.0(exclusive) and 1.0(inclusive)
* @param int $maxIterations Must be at least 1
*
* @throws InvalidArgumentException
*/
public function __construct(float $learningRate = 0.001, int $maxIterations = 1000, bool $normalizeInputs = true)
{
if ($learningRate <= 0.0 || $learningRate > 1.0) {
throw new InvalidArgumentException('Learning rate should be a float value between 0.0(exclusive) and 1.0(inclusive)');
}
if ($maxIterations <= 0) {
throw new InvalidArgumentException('Maximum number of iterations must be an integer greater than 0');
}
if ($normalizeInputs) {
$this->normalizer = new Normalizer(Normalizer::NORM_STD);
}
$this->learningRate = $learningRate;
$this->maxIterations = $maxIterations;
}
public function partialTrain(array $samples, array $targets, array $labels = []): void
{
$this->trainByLabel($samples, $targets, $labels);
}
public function trainBinary(array $samples, array $targets, array $labels): void
{
if ($this->normalizer !== null) {
$this->normalizer->transform($samples);
}
// Set all target values to either -1 or 1
$this->labels = [
1 => $labels[0],
-1 => $labels[1],
];
foreach ($targets as $key => $target) {
$targets[$key] = (string) $target == (string) $this->labels[1] ? 1 : -1;
}
// Set samples and feature count vars
$this->featureCount = count($samples[0]);
$this->runTraining($samples, $targets);
}
/**
* Normally enabling early stopping for the optimization procedure may
* help saving processing time while in some cases it may result in
* premature convergence.<br>
*
* If "false" is given, the optimization procedure will always be executed
* for $maxIterations times
*
* @return $this
*/
public function setEarlyStop(bool $enable = true)
{
$this->enableEarlyStop = $enable;
return $this;
}
/**
* Returns the cost values obtained during the training.
*/
public function getCostValues(): array
{
return $this->costValues;
}
protected function resetBinary(): void
{
$this->labels = [];
$this->optimizer = null;
$this->featureCount = 0;
$this->weights = [];
$this->costValues = [];
}
/**
* Trains the perceptron model with Stochastic Gradient Descent optimization
* to get the correct set of weights
*/
protected function runTraining(array $samples, array $targets): void
{
// The cost function is the sum of squares
$callback = function ($weights, $sample, $target): array {
$this->weights = $weights;
$prediction = $this->outputClass($sample);
$gradient = $prediction - $target;
$error = $gradient ** 2;
return [$error, $gradient];
};
$this->runGradientDescent($samples, $targets, $callback);
}
/**
* Executes a Gradient Descent algorithm for
* the given cost function
*/
protected function runGradientDescent(array $samples, array $targets, Closure $gradientFunc, bool $isBatch = false): void
{
$class = $isBatch ? GD::class : StochasticGD::class;
if ($this->optimizer === null) {
$this->optimizer = (new $class($this->featureCount))
->setLearningRate($this->learningRate)
->setMaxIterations($this->maxIterations)
->setChangeThreshold(1e-6)
->setEarlyStop($this->enableEarlyStop);
}
$this->weights = $this->optimizer->runOptimization($samples, $targets, $gradientFunc);
$this->costValues = $this->optimizer->getCostValues();
}
/**
* Checks if the sample should be normalized and if so, returns the
* normalized sample
*/
protected function checkNormalizedSample(array $sample): array
{
if ($this->normalizer !== null) {
$samples = [$sample];
$this->normalizer->transform($samples);
$sample = $samples[0];
}
return $sample;
}
/**
* Calculates net output of the network as a float value for the given input
*
* @return int|float
*/
protected function output(array $sample)
{
$sum = 0;
foreach ($this->weights as $index => $w) {
if ($index == 0) {
$sum += $w;
} else {
$sum += $w * $sample[$index - 1];
}
}
return $sum;
}
/**
* Returns the class value (either -1 or 1) for the given input
*/
protected function outputClass(array $sample): int
{
return $this->output($sample) > 0 ? 1 : -1;
}
/**
* Returns the probability of the sample of belonging to the given label.
*
* The probability is simply taken as the distance of the sample
* to the decision plane.
*
* @param mixed $label
*/
protected function predictProbability(array $sample, $label): float
{
$predicted = $this->predictSampleBinary($sample);
if ((string) $predicted == (string) $label) {
$sample = $this->checkNormalizedSample($sample);
return (float) abs($this->output($sample));
}
return 0.0;
}
/**
* @return mixed
*/
protected function predictSampleBinary(array $sample)
{
$sample = $this->checkNormalizedSample($sample);
$predictedClass = $this->outputClass($sample);
return $this->labels[$predictedClass];
}
}
@@ -0,0 +1,58 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification;
use Phpml\Exception\InvalidArgumentException;
use Phpml\NeuralNetwork\Network\MultilayerPerceptron;
class MLPClassifier extends MultilayerPerceptron implements Classifier
{
/**
* @param mixed $target
*
* @throws InvalidArgumentException
*/
public function getTargetClass($target): int
{
if (!in_array($target, $this->classes, true)) {
throw new InvalidArgumentException(
sprintf('Target with value "%s" is not part of the accepted classes', $target)
);
}
return array_search($target, $this->classes, true);
}
/**
* @return mixed
*/
protected function predictSample(array $sample)
{
$output = $this->setInput($sample)->getOutput();
$predictedClass = null;
$max = 0;
foreach ($output as $class => $value) {
if ($value > $max) {
$predictedClass = $class;
$max = $value;
}
}
return $predictedClass;
}
/**
* @param mixed $target
*/
protected function trainSample(array $sample, $target): void
{
// Feed-forward.
$this->setInput($sample);
// Back-propagate.
$this->backpropagation->backpropagate($this->getLayers(), $this->getTargetClass($target));
}
}
@@ -0,0 +1,184 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Helper\Predictable;
use Phpml\Helper\Trainable;
use Phpml\Math\Statistic\Mean;
use Phpml\Math\Statistic\StandardDeviation;
class NaiveBayes implements Classifier
{
use Trainable;
use Predictable;
public const CONTINUOS = 1;
public const NOMINAL = 2;
public const EPSILON = 1e-10;
/**
* @var array
*/
private $std = [];
/**
* @var array
*/
private $mean = [];
/**
* @var array
*/
private $discreteProb = [];
/**
* @var array
*/
private $dataType = [];
/**
* @var array
*/
private $p = [];
/**
* @var int
*/
private $sampleCount = 0;
/**
* @var int
*/
private $featureCount = 0;
/**
* @var array
*/
private $labels = [];
public function train(array $samples, array $targets): void
{
$this->samples = array_merge($this->samples, $samples);
$this->targets = array_merge($this->targets, $targets);
$this->sampleCount = count($this->samples);
$this->featureCount = count($this->samples[0]);
$this->labels = array_map('strval', array_flip(array_flip($this->targets)));
foreach ($this->labels as $label) {
$samples = $this->getSamplesByLabel($label);
$this->p[$label] = count($samples) / $this->sampleCount;
$this->calculateStatistics($label, $samples);
}
}
/**
* @return mixed
*/
protected function predictSample(array $sample)
{
// Use NaiveBayes assumption for each label using:
// P(label|features) = P(label) * P(feature0|label) * P(feature1|label) .... P(featureN|label)
// Then compare probability for each class to determine which label is most likely
$predictions = [];
foreach ($this->labels as $label) {
$p = $this->p[$label];
for ($i = 0; $i < $this->featureCount; ++$i) {
$Plf = $this->sampleProbability($sample, $i, $label);
$p += $Plf;
}
$predictions[$label] = $p;
}
arsort($predictions, SORT_NUMERIC);
reset($predictions);
return key($predictions);
}
/**
* Calculates vital statistics for each label & feature. Stores these
* values in private array in order to avoid repeated calculation
*/
private function calculateStatistics(string $label, array $samples): void
{
$this->std[$label] = array_fill(0, $this->featureCount, 0);
$this->mean[$label] = array_fill(0, $this->featureCount, 0);
$this->dataType[$label] = array_fill(0, $this->featureCount, self::CONTINUOS);
$this->discreteProb[$label] = array_fill(0, $this->featureCount, self::CONTINUOS);
for ($i = 0; $i < $this->featureCount; ++$i) {
// Get the values of nth column in the samples array
// Mean::arithmetic is called twice, can be optimized
$values = array_column($samples, $i);
$numValues = count($values);
// if the values contain non-numeric data,
// then it should be treated as nominal/categorical/discrete column
if ($values !== array_filter($values, 'is_numeric')) {
$this->dataType[$label][$i] = self::NOMINAL;
$this->discreteProb[$label][$i] = array_count_values($values);
$db = &$this->discreteProb[$label][$i];
$db = array_map(function ($el) use ($numValues) {
return $el / $numValues;
}, $db);
} else {
$this->mean[$label][$i] = Mean::arithmetic($values);
// Add epsilon in order to avoid zero stdev
$this->std[$label][$i] = 1e-10 + StandardDeviation::population($values, false);
}
}
}
/**
* Calculates the probability P(label|sample_n)
*/
private function sampleProbability(array $sample, int $feature, string $label): float
{
if (!isset($sample[$feature])) {
throw new InvalidArgumentException('Missing feature. All samples must have equal number of features');
}
$value = $sample[$feature];
if ($this->dataType[$label][$feature] == self::NOMINAL) {
if (!isset($this->discreteProb[$label][$feature][$value]) ||
$this->discreteProb[$label][$feature][$value] == 0) {
return self::EPSILON;
}
return $this->discreteProb[$label][$feature][$value];
}
$std = $this->std[$label][$feature];
$mean = $this->mean[$label][$feature];
// Calculate the probability density by use of normal/Gaussian distribution
// Ref: https://en.wikipedia.org/wiki/Normal_distribution
//
// In order to avoid numerical errors because of small or zero values,
// some libraries adopt taking log of calculations such as
// scikit-learn did.
// (See : https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/naive_bayes.py)
$pdf = -0.5 * log(2.0 * M_PI * $std * $std);
$pdf -= 0.5 * (($value - $mean) ** 2) / ($std * $std);
return $pdf;
}
/**
* Return samples belonging to specific label
*/
private function getSamplesByLabel(string $label): array
{
$samples = [];
for ($i = 0; $i < $this->sampleCount; ++$i) {
if ($this->targets[$i] == $label) {
$samples[] = $this->samples[$i];
}
}
return $samples;
}
}
@@ -0,0 +1,26 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification;
use Phpml\SupportVectorMachine\Kernel;
use Phpml\SupportVectorMachine\SupportVectorMachine;
use Phpml\SupportVectorMachine\Type;
class SVC extends SupportVectorMachine implements Classifier
{
public function __construct(
int $kernel = Kernel::RBF,
float $cost = 1.0,
int $degree = 3,
?float $gamma = null,
float $coef0 = 0.0,
float $tolerance = 0.001,
int $cacheSize = 100,
bool $shrinking = true,
bool $probabilityEstimates = false
) {
parent::__construct(Type::C_SVC, $kernel, $cost, 0.5, $degree, $gamma, $coef0, 0.1, $tolerance, $cacheSize, $shrinking, $probabilityEstimates);
}
}
@@ -0,0 +1,21 @@
<?php
declare(strict_types=1);
namespace Phpml\Classification;
abstract class WeightedClassifier implements Classifier
{
/**
* @var array
*/
protected $weights = [];
/**
* Sets the array including a weight for each sample
*/
public function setSampleWeights(array $weights): void
{
$this->weights = $weights;
}
}
@@ -0,0 +1,10 @@
<?php
declare(strict_types=1);
namespace Phpml\Clustering;
interface Clusterer
{
public function cluster(array $samples): array;
}
@@ -0,0 +1,120 @@
<?php
declare(strict_types=1);
namespace Phpml\Clustering;
use Phpml\Math\Distance;
use Phpml\Math\Distance\Euclidean;
class DBSCAN implements Clusterer
{
private const NOISE = -1;
/**
* @var float
*/
private $epsilon;
/**
* @var int
*/
private $minSamples;
/**
* @var Distance
*/
private $distanceMetric;
public function __construct(float $epsilon = 0.5, int $minSamples = 3, ?Distance $distanceMetric = null)
{
if ($distanceMetric === null) {
$distanceMetric = new Euclidean();
}
$this->epsilon = $epsilon;
$this->minSamples = $minSamples;
$this->distanceMetric = $distanceMetric;
}
public function cluster(array $samples): array
{
$labels = [];
$n = 0;
foreach ($samples as $index => $sample) {
if (isset($labels[$index])) {
continue;
}
$neighborIndices = $this->getIndicesInRegion($sample, $samples);
if (count($neighborIndices) < $this->minSamples) {
$labels[$index] = self::NOISE;
continue;
}
$labels[$index] = $n;
$this->expandCluster($samples, $neighborIndices, $labels, $n);
++$n;
}
return $this->groupByCluster($samples, $labels, $n);
}
private function expandCluster(array $samples, array $seeds, array &$labels, int $n): void
{
while (($index = array_pop($seeds)) !== null) {
if (isset($labels[$index])) {
if ($labels[$index] === self::NOISE) {
$labels[$index] = $n;
}
continue;
}
$labels[$index] = $n;
$sample = $samples[$index];
$neighborIndices = $this->getIndicesInRegion($sample, $samples);
if (count($neighborIndices) >= $this->minSamples) {
$seeds = array_unique(array_merge($seeds, $neighborIndices));
}
}
}
private function getIndicesInRegion(array $center, array $samples): array
{
$indices = [];
foreach ($samples as $index => $sample) {
if ($this->distanceMetric->distance($center, $sample) < $this->epsilon) {
$indices[] = $index;
}
}
return $indices;
}
private function groupByCluster(array $samples, array $labels, int $n): array
{
$clusters = array_fill(0, $n, []);
foreach ($samples as $index => $sample) {
if ($labels[$index] !== self::NOISE) {
$clusters[$labels[$index]][$index] = $sample;
}
}
// Reindex (i.e. to 0, 1, 2, ...) integer indices for backword compatibility
foreach ($clusters as $index => $cluster) {
$clusters[$index] = array_merge($cluster, []);
}
return $clusters;
}
}
@@ -0,0 +1,236 @@
<?php
declare(strict_types=1);
namespace Phpml\Clustering;
use Phpml\Clustering\KMeans\Cluster;
use Phpml\Clustering\KMeans\Point;
use Phpml\Clustering\KMeans\Space;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Math\Distance\Euclidean;
class FuzzyCMeans implements Clusterer
{
/**
* @var int
*/
private $clustersNumber;
/**
* @var Cluster[]
*/
private $clusters = [];
/**
* @var Space
*/
private $space;
/**
* @var float[][]
*/
private $membership = [];
/**
* @var float
*/
private $fuzziness;
/**
* @var float
*/
private $epsilon;
/**
* @var int
*/
private $maxIterations;
/**
* @var int
*/
private $sampleCount;
/**
* @var array
*/
private $samples = [];
/**
* @throws InvalidArgumentException
*/
public function __construct(int $clustersNumber, float $fuzziness = 2.0, float $epsilon = 1e-2, int $maxIterations = 100)
{
if ($clustersNumber <= 0) {
throw new InvalidArgumentException('Invalid clusters number');
}
$this->clustersNumber = $clustersNumber;
$this->fuzziness = $fuzziness;
$this->epsilon = $epsilon;
$this->maxIterations = $maxIterations;
}
public function getMembershipMatrix(): array
{
return $this->membership;
}
public function cluster(array $samples): array
{
// Initialize variables, clusters and membership matrix
$this->sampleCount = count($samples);
$this->samples = &$samples;
$this->space = new Space(count($samples[0]));
$this->initClusters();
// Our goal is minimizing the objective value while
// executing the clustering steps at a maximum number of iterations
$lastObjective = 0.0;
$iterations = 0;
do {
// Update the membership matrix and cluster centers, respectively
$this->updateMembershipMatrix();
$this->updateClusters();
// Calculate the new value of the objective function
$objectiveVal = $this->getObjective();
$difference = abs($lastObjective - $objectiveVal);
$lastObjective = $objectiveVal;
} while ($difference > $this->epsilon && $iterations++ <= $this->maxIterations);
// Attach (hard cluster) each data point to the nearest cluster
for ($k = 0; $k < $this->sampleCount; ++$k) {
$column = array_column($this->membership, $k);
arsort($column);
reset($column);
$cluster = $this->clusters[key($column)];
$cluster->attach(new Point($this->samples[$k]));
}
// Return grouped samples
$grouped = [];
foreach ($this->clusters as $cluster) {
$grouped[] = $cluster->getPoints();
}
return $grouped;
}
protected function initClusters(): void
{
// Membership array is a matrix of cluster number by sample counts
// We initilize the membership array with random values
$dim = $this->space->getDimension();
$this->generateRandomMembership($dim, $this->sampleCount);
$this->updateClusters();
}
protected function generateRandomMembership(int $rows, int $cols): void
{
$this->membership = [];
for ($i = 0; $i < $rows; ++$i) {
$row = [];
$total = 0.0;
for ($k = 0; $k < $cols; ++$k) {
$val = random_int(1, 5) / 10.0;
$row[] = $val;
$total += $val;
}
$this->membership[] = array_map(static function ($val) use ($total): float {
return $val / $total;
}, $row);
}
}
protected function updateClusters(): void
{
$dim = $this->space->getDimension();
if (count($this->clusters) === 0) {
for ($i = 0; $i < $this->clustersNumber; ++$i) {
$this->clusters[] = new Cluster($this->space, array_fill(0, $dim, 0.0));
}
}
for ($i = 0; $i < $this->clustersNumber; ++$i) {
$cluster = $this->clusters[$i];
$center = $cluster->getCoordinates();
for ($k = 0; $k < $dim; ++$k) {
$a = $this->getMembershipRowTotal($i, $k, true);
$b = $this->getMembershipRowTotal($i, $k, false);
$center[$k] = $a / $b;
}
$cluster->setCoordinates($center);
}
}
protected function getMembershipRowTotal(int $row, int $col, bool $multiply): float
{
$sum = 0.0;
for ($k = 0; $k < $this->sampleCount; ++$k) {
$val = $this->membership[$row][$k] ** $this->fuzziness;
if ($multiply) {
$val *= $this->samples[$k][$col];
}
$sum += $val;
}
return $sum;
}
protected function updateMembershipMatrix(): void
{
for ($i = 0; $i < $this->clustersNumber; ++$i) {
for ($k = 0; $k < $this->sampleCount; ++$k) {
$distCalc = $this->getDistanceCalc($i, $k);
$this->membership[$i][$k] = 1.0 / $distCalc;
}
}
}
protected function getDistanceCalc(int $row, int $col): float
{
$sum = 0.0;
$distance = new Euclidean();
$dist1 = $distance->distance(
$this->clusters[$row]->getCoordinates(),
$this->samples[$col]
);
for ($j = 0; $j < $this->clustersNumber; ++$j) {
$dist2 = $distance->distance(
$this->clusters[$j]->getCoordinates(),
$this->samples[$col]
);
$val = (($dist1 / $dist2) ** 2.0) / ($this->fuzziness - 1);
$sum += $val;
}
return $sum;
}
/**
* The objective is to minimize the distance between all data points
* and all cluster centers. This method returns the summation of all
* these distances
*/
protected function getObjective(): float
{
$sum = 0.0;
$distance = new Euclidean();
for ($i = 0; $i < $this->clustersNumber; ++$i) {
$clust = $this->clusters[$i]->getCoordinates();
for ($k = 0; $k < $this->sampleCount; ++$k) {
$point = $this->samples[$k];
$sum += $distance->distance($clust, $point);
}
}
return $sum;
}
}
@@ -0,0 +1,50 @@
<?php
declare(strict_types=1);
namespace Phpml\Clustering;
use Phpml\Clustering\KMeans\Space;
use Phpml\Exception\InvalidArgumentException;
class KMeans implements Clusterer
{
public const INIT_RANDOM = 1;
public const INIT_KMEANS_PLUS_PLUS = 2;
/**
* @var int
*/
private $clustersNumber;
/**
* @var int
*/
private $initialization;
public function __construct(int $clustersNumber, int $initialization = self::INIT_KMEANS_PLUS_PLUS)
{
if ($clustersNumber <= 0) {
throw new InvalidArgumentException('Invalid clusters number');
}
$this->clustersNumber = $clustersNumber;
$this->initialization = $initialization;
}
public function cluster(array $samples): array
{
$space = new Space(count(reset($samples)));
foreach ($samples as $key => $sample) {
$space->addPoint($sample, $key);
}
$clusters = [];
foreach ($space->cluster($this->clustersNumber, $this->initialization) as $cluster) {
$clusters[] = $cluster->getPoints();
}
return $clusters;
}
}
@@ -0,0 +1,117 @@
<?php
declare(strict_types=1);
namespace Phpml\Clustering\KMeans;
use IteratorAggregate;
use LogicException;
use SplObjectStorage;
class Cluster extends Point implements IteratorAggregate
{
/**
* @var Space
*/
protected $space;
/**
* @var SplObjectStorage|Point[]
*/
protected $points;
public function __construct(Space $space, array $coordinates)
{
parent::__construct($coordinates);
$this->space = $space;
$this->points = new SplObjectStorage();
}
public function getPoints(): array
{
$points = [];
foreach ($this->points as $point) {
if ($point->label === null) {
$points[] = $point->toArray();
} else {
$points[$point->label] = $point->toArray();
}
}
return $points;
}
public function toArray(): array
{
return [
'centroid' => parent::toArray(),
'points' => $this->getPoints(),
];
}
public function attach(Point $point): Point
{
if ($point instanceof self) {
throw new LogicException('Cannot attach a cluster to another');
}
$this->points->attach($point);
return $point;
}
public function detach(Point $point): Point
{
$this->points->detach($point);
return $point;
}
public function attachAll(SplObjectStorage $points): void
{
$this->points->addAll($points);
}
public function detachAll(SplObjectStorage $points): void
{
$this->points->removeAll($points);
}
public function updateCentroid(): void
{
$count = count($this->points);
if ($count === 0) {
return;
}
$centroid = $this->space->newPoint(array_fill(0, $this->dimension, 0));
foreach ($this->points as $point) {
for ($n = 0; $n < $this->dimension; ++$n) {
$centroid->coordinates[$n] += $point->coordinates[$n];
}
}
for ($n = 0; $n < $this->dimension; ++$n) {
$this->coordinates[$n] = $centroid->coordinates[$n] / $count;
}
}
/**
* @return Point[]|SplObjectStorage
*/
public function getIterator()
{
return $this->points;
}
public function count(): int
{
return count($this->points);
}
public function setCoordinates(array $newCoordinates): void
{
$this->coordinates = $newCoordinates;
}
}
@@ -0,0 +1,125 @@
<?php
declare(strict_types=1);
namespace Phpml\Clustering\KMeans;
use ArrayAccess;
class Point implements ArrayAccess, \Countable
{
/**
* @var int
*/
protected $dimension;
/**
* @var array
*/
protected $coordinates = [];
/**
* @var mixed
*/
protected $label;
/**
* @param mixed $label
*/
public function __construct(array $coordinates, $label = null)
{
$this->dimension = count($coordinates);
$this->coordinates = $coordinates;
$this->label = $label;
}
public function toArray(): array
{
return $this->coordinates;
}
/**
* @return float|int
*/
public function getDistanceWith(self $point, bool $precise = true)
{
$distance = 0;
for ($n = 0; $n < $this->dimension; ++$n) {
$difference = $this->coordinates[$n] - $point->coordinates[$n];
$distance += $difference * $difference;
}
return $precise ? $distance ** .5 : $distance;
}
/**
* @param Point[] $points
*/
public function getClosest(array $points): ?self
{
$minPoint = null;
foreach ($points as $point) {
$distance = $this->getDistanceWith($point, false);
if (!isset($minDistance)) {
$minDistance = $distance;
$minPoint = $point;
continue;
}
if ($distance < $minDistance) {
$minDistance = $distance;
$minPoint = $point;
}
}
return $minPoint;
}
public function getCoordinates(): array
{
return $this->coordinates;
}
/**
* @param mixed $offset
*/
public function offsetExists($offset): bool
{
return isset($this->coordinates[$offset]);
}
/**
* @param mixed $offset
*
* @return mixed
*/
public function offsetGet($offset)
{
return $this->coordinates[$offset];
}
/**
* @param mixed $offset
* @param mixed $value
*/
public function offsetSet($offset, $value): void
{
$this->coordinates[$offset] = $value;
}
/**
* @param mixed $offset
*/
public function offsetUnset($offset): void
{
unset($this->coordinates[$offset]);
}
public function count(): int
{
return count($this->coordinates);
}
}
@@ -0,0 +1,263 @@
<?php
declare(strict_types=1);
namespace Phpml\Clustering\KMeans;
use InvalidArgumentException;
use LogicException;
use Phpml\Clustering\KMeans;
use SplObjectStorage;
class Space extends SplObjectStorage
{
/**
* @var int
*/
protected $dimension;
public function __construct(int $dimension)
{
if ($dimension < 1) {
throw new LogicException('a space dimension cannot be null or negative');
}
$this->dimension = $dimension;
}
public function toArray(): array
{
$points = [];
/** @var Point $point */
foreach ($this as $point) {
$points[] = $point->toArray();
}
return ['points' => $points];
}
/**
* @param mixed $label
*/
public function newPoint(array $coordinates, $label = null): Point
{
if (count($coordinates) !== $this->dimension) {
throw new LogicException('('.implode(',', $coordinates).') is not a point of this space');
}
return new Point($coordinates, $label);
}
/**
* @param mixed $label
* @param mixed $data
*/
public function addPoint(array $coordinates, $label = null, $data = null): void
{
$this->attach($this->newPoint($coordinates, $label), $data);
}
/**
* @param object $point
* @param mixed $data
*/
public function attach($point, $data = null): void
{
if (!$point instanceof Point) {
throw new InvalidArgumentException('can only attach points to spaces');
}
parent::attach($point, $data);
}
public function getDimension(): int
{
return $this->dimension;
}
/**
* @return array|bool
*/
public function getBoundaries()
{
if (count($this) === 0) {
return false;
}
$min = $this->newPoint(array_fill(0, $this->dimension, null));
$max = $this->newPoint(array_fill(0, $this->dimension, null));
/** @var Point $point */
foreach ($this as $point) {
for ($n = 0; $n < $this->dimension; ++$n) {
if ($min[$n] === null || $min[$n] > $point[$n]) {
$min[$n] = $point[$n];
}
if ($max[$n] === null || $max[$n] < $point[$n]) {
$max[$n] = $point[$n];
}
}
}
return [$min, $max];
}
public function getRandomPoint(Point $min, Point $max): Point
{
$point = $this->newPoint(array_fill(0, $this->dimension, null));
for ($n = 0; $n < $this->dimension; ++$n) {
$point[$n] = random_int($min[$n], $max[$n]);
}
return $point;
}
/**
* @return Cluster[]
*/
public function cluster(int $clustersNumber, int $initMethod = KMeans::INIT_RANDOM): array
{
$clusters = $this->initializeClusters($clustersNumber, $initMethod);
do {
} while (!$this->iterate($clusters));
return $clusters;
}
/**
* @return Cluster[]
*/
protected function initializeClusters(int $clustersNumber, int $initMethod): array
{
switch ($initMethod) {
case KMeans::INIT_RANDOM:
$clusters = $this->initializeRandomClusters($clustersNumber);
break;
case KMeans::INIT_KMEANS_PLUS_PLUS:
$clusters = $this->initializeKMPPClusters($clustersNumber);
break;
default:
return [];
}
$clusters[0]->attachAll($this);
return $clusters;
}
/**
* @param Cluster[] $clusters
*/
protected function iterate(array $clusters): bool
{
$convergence = true;
$attach = new SplObjectStorage();
$detach = new SplObjectStorage();
foreach ($clusters as $cluster) {
foreach ($cluster as $point) {
$closest = $point->getClosest($clusters);
if ($closest === null) {
continue;
}
if ($closest !== $cluster) {
$attach[$closest] ?? $attach[$closest] = new SplObjectStorage();
$detach[$cluster] ?? $detach[$cluster] = new SplObjectStorage();
$attach[$closest]->attach($point);
$detach[$cluster]->attach($point);
$convergence = false;
}
}
}
/** @var Cluster $cluster */
foreach ($attach as $cluster) {
$cluster->attachAll($attach[$cluster]);
}
/** @var Cluster $cluster */
foreach ($detach as $cluster) {
$cluster->detachAll($detach[$cluster]);
}
foreach ($clusters as $cluster) {
$cluster->updateCentroid();
}
return $convergence;
}
/**
* @return Cluster[]
*/
protected function initializeKMPPClusters(int $clustersNumber): array
{
$clusters = [];
$this->rewind();
/** @var Point $current */
$current = $this->current();
$clusters[] = new Cluster($this, $current->getCoordinates());
$distances = new SplObjectStorage();
for ($i = 1; $i < $clustersNumber; ++$i) {
$sum = 0;
/** @var Point $point */
foreach ($this as $point) {
$closest = $point->getClosest($clusters);
if ($closest === null) {
continue;
}
$distance = $point->getDistanceWith($closest);
$sum += $distances[$point] = $distance;
}
$sum = random_int(0, (int) $sum);
/** @var Point $point */
foreach ($this as $point) {
$sum -= $distances[$point];
if ($sum > 0) {
continue;
}
$clusters[] = new Cluster($this, $point->getCoordinates());
break;
}
}
return $clusters;
}
/**
* @return Cluster[]
*/
private function initializeRandomClusters(int $clustersNumber): array
{
$clusters = [];
[$min, $max] = $this->getBoundaries();
for ($n = 0; $n < $clustersNumber; ++$n) {
$clusters[] = new Cluster($this, $this->getRandomPoint($min, $max)->getCoordinates());
}
return $clusters;
}
}
@@ -0,0 +1,26 @@
<?php
declare(strict_types=1);
namespace Phpml\CrossValidation;
use Phpml\Dataset\Dataset;
class RandomSplit extends Split
{
protected function splitDataset(Dataset $dataset, float $testSize): void
{
$samples = $dataset->getSamples();
$labels = $dataset->getTargets();
$datasetSize = count($samples);
$testCount = count($this->testSamples);
for ($i = $datasetSize; $i > 0; --$i) {
$key = mt_rand(0, $datasetSize - 1);
$setName = (count($this->testSamples) - $testCount) / $datasetSize >= $testSize ? 'train' : 'test';
$this->{$setName.'Samples'}[] = $samples[$key];
$this->{$setName.'Labels'}[] = $labels[$key];
}
}
}
@@ -0,0 +1,73 @@
<?php
declare(strict_types=1);
namespace Phpml\CrossValidation;
use Phpml\Dataset\Dataset;
use Phpml\Exception\InvalidArgumentException;
abstract class Split
{
/**
* @var array
*/
protected $trainSamples = [];
/**
* @var array
*/
protected $testSamples = [];
/**
* @var array
*/
protected $trainLabels = [];
/**
* @var array
*/
protected $testLabels = [];
public function __construct(Dataset $dataset, float $testSize = 0.3, ?int $seed = null)
{
if ($testSize <= 0 || $testSize >= 1) {
throw new InvalidArgumentException('testsize must be between 0.0 and 1.0');
}
$this->seedGenerator($seed);
$this->splitDataset($dataset, $testSize);
}
public function getTrainSamples(): array
{
return $this->trainSamples;
}
public function getTestSamples(): array
{
return $this->testSamples;
}
public function getTrainLabels(): array
{
return $this->trainLabels;
}
public function getTestLabels(): array
{
return $this->testLabels;
}
abstract protected function splitDataset(Dataset $dataset, float $testSize): void;
protected function seedGenerator(?int $seed = null): void
{
if ($seed === null) {
mt_srand();
} else {
mt_srand($seed);
}
}
}
@@ -0,0 +1,49 @@
<?php
declare(strict_types=1);
namespace Phpml\CrossValidation;
use Phpml\Dataset\ArrayDataset;
use Phpml\Dataset\Dataset;
class StratifiedRandomSplit extends RandomSplit
{
protected function splitDataset(Dataset $dataset, float $testSize): void
{
$datasets = $this->splitByTarget($dataset);
foreach ($datasets as $targetSet) {
parent::splitDataset($targetSet, $testSize);
}
}
/**
* @return Dataset[]
*/
private function splitByTarget(Dataset $dataset): array
{
$targets = $dataset->getTargets();
$samples = $dataset->getSamples();
$uniqueTargets = array_unique($targets);
/** @var array $split */
$split = array_combine($uniqueTargets, array_fill(0, count($uniqueTargets), []));
foreach ($samples as $key => $sample) {
$split[$targets[$key]][] = $sample;
}
return $this->createDatasets($uniqueTargets, $split);
}
private function createDatasets(array $uniqueTargets, array $split): array
{
$datasets = [];
foreach ($uniqueTargets as $target) {
$datasets[$target] = new ArrayDataset($split[$target], array_fill(0, count($split[$target]), $target));
}
return $datasets;
}
}
@@ -0,0 +1,62 @@
<?php
declare(strict_types=1);
namespace Phpml\Dataset;
use Phpml\Exception\InvalidArgumentException;
class ArrayDataset implements Dataset
{
/**
* @var array
*/
protected $samples = [];
/**
* @var array
*/
protected $targets = [];
/**
* @throws InvalidArgumentException
*/
public function __construct(array $samples, array $targets)
{
if (count($samples) !== count($targets)) {
throw new InvalidArgumentException('Size of given arrays does not match');
}
$this->samples = $samples;
$this->targets = $targets;
}
public function getSamples(): array
{
return $this->samples;
}
public function getTargets(): array
{
return $this->targets;
}
/**
* @param int[] $columns
*/
public function removeColumns(array $columns): void
{
foreach ($this->samples as &$sample) {
$this->removeColumnsFromSample($sample, $columns);
}
}
private function removeColumnsFromSample(array &$sample, array $columns): void
{
foreach ($columns as $index) {
unset($sample[$index]);
}
$sample = array_values($sample);
}
}
@@ -0,0 +1,52 @@
<?php
declare(strict_types=1);
namespace Phpml\Dataset;
use Phpml\Exception\FileException;
class CsvDataset extends ArrayDataset
{
/**
* @var array
*/
protected $columnNames = [];
/**
* @throws FileException
*/
public function __construct(string $filepath, int $features, bool $headingRow = true, string $delimiter = ',', int $maxLineLength = 0)
{
if (!file_exists($filepath)) {
throw new FileException(sprintf('File "%s" missing.', basename($filepath)));
}
$handle = fopen($filepath, 'rb');
if ($handle === false) {
throw new FileException(sprintf('File "%s" can\'t be open.', basename($filepath)));
}
if ($headingRow) {
$data = fgetcsv($handle, $maxLineLength, $delimiter);
$this->columnNames = array_slice((array) $data, 0, $features);
} else {
$this->columnNames = range(0, $features - 1);
}
$samples = $targets = [];
while ($data = fgetcsv($handle, $maxLineLength, $delimiter)) {
$samples[] = array_slice($data, 0, $features);
$targets[] = $data[$features];
}
fclose($handle);
parent::__construct($samples, $targets);
}
public function getColumnNames(): array
{
return $this->columnNames;
}
}
@@ -0,0 +1,12 @@
<?php
declare(strict_types=1);
namespace Phpml\Dataset;
interface Dataset
{
public function getSamples(): array;
public function getTargets(): array;
}
@@ -0,0 +1,28 @@
<?php
declare(strict_types=1);
namespace Phpml\Dataset\Demo;
use Phpml\Dataset\CsvDataset;
/**
* Classes: 6
* Samples per class:
* 70 float processed building windows
* 17 float processed vehicle windows
* 76 non-float processed building windows
* 13 containers
* 9 tableware
* 29 headlamps
* Samples total: 214
* Features per sample: 9.
*/
class GlassDataset extends CsvDataset
{
public function __construct()
{
$filepath = __DIR__.'/../../../data/glass.csv';
parent::__construct($filepath, 9, true);
}
}
@@ -0,0 +1,22 @@
<?php
declare(strict_types=1);
namespace Phpml\Dataset\Demo;
use Phpml\Dataset\CsvDataset;
/**
* Classes: 3
* Samples per class: 50
* Samples total: 150
* Features per sample: 4.
*/
class IrisDataset extends CsvDataset
{
public function __construct()
{
$filepath = __DIR__.'/../../../data/iris.csv';
parent::__construct($filepath, 4, true);
}
}
@@ -0,0 +1,22 @@
<?php
declare(strict_types=1);
namespace Phpml\Dataset\Demo;
use Phpml\Dataset\CsvDataset;
/**
* Classes: 3
* Samples per class: class 1 59; class 2 71; class 3 48
* Samples total: 178
* Features per sample: 13.
*/
class WineDataset extends CsvDataset
{
public function __construct()
{
$filepath = __DIR__.'/../../../data/wine.csv';
parent::__construct($filepath, 13, true);
}
}
@@ -0,0 +1,47 @@
<?php
declare(strict_types=1);
namespace Phpml\Dataset;
use Phpml\Exception\DatasetException;
class FilesDataset extends ArrayDataset
{
public function __construct(string $rootPath)
{
if (!is_dir($rootPath)) {
throw new DatasetException(sprintf('Dataset root folder "%s" missing.', $rootPath));
}
$this->scanRootPath($rootPath);
}
private function scanRootPath(string $rootPath): void
{
$dirs = glob($rootPath.DIRECTORY_SEPARATOR.'*', GLOB_ONLYDIR);
if ($dirs === false) {
throw new DatasetException(sprintf('An error occurred during directory "%s" scan', $rootPath));
}
foreach ($dirs as $dir) {
$this->scanDir($dir);
}
}
private function scanDir(string $dir): void
{
$target = basename($dir);
$files = glob($dir.DIRECTORY_SEPARATOR.'*');
if ($files === false) {
return;
}
foreach (array_filter($files, 'is_file') as $file) {
$this->samples[] = file_get_contents($file);
$this->targets[] = $target;
}
}
}
@@ -0,0 +1,101 @@
<?php
declare(strict_types=1);
namespace Phpml\Dataset;
use Phpml\Exception\InvalidArgumentException;
/**
* MNIST dataset: http://yann.lecun.com/exdb/mnist/
* original mnist dataset reader: https://github.com/AndrewCarterUK/mnist-neural-network-plain-php
*/
final class MnistDataset extends ArrayDataset
{
private const MAGIC_IMAGE = 0x00000803;
private const MAGIC_LABEL = 0x00000801;
private const IMAGE_ROWS = 28;
private const IMAGE_COLS = 28;
public function __construct(string $imagePath, string $labelPath)
{
$this->samples = $this->readImages($imagePath);
$this->targets = $this->readLabels($labelPath);
if (count($this->samples) !== count($this->targets)) {
throw new InvalidArgumentException('Must have the same number of images and labels');
}
}
private function readImages(string $imagePath): array
{
$stream = fopen($imagePath, 'rb');
if ($stream === false) {
throw new InvalidArgumentException('Could not open file: '.$imagePath);
}
$images = [];
try {
$header = fread($stream, 16);
$fields = unpack('Nmagic/Nsize/Nrows/Ncols', (string) $header);
if ($fields['magic'] !== self::MAGIC_IMAGE) {
throw new InvalidArgumentException('Invalid magic number: '.$imagePath);
}
if ($fields['rows'] != self::IMAGE_ROWS) {
throw new InvalidArgumentException('Invalid number of image rows: '.$imagePath);
}
if ($fields['cols'] != self::IMAGE_COLS) {
throw new InvalidArgumentException('Invalid number of image cols: '.$imagePath);
}
for ($i = 0; $i < $fields['size']; $i++) {
$imageBytes = fread($stream, $fields['rows'] * $fields['cols']);
// Convert to float between 0 and 1
$images[] = array_map(function ($b) {
return $b / 255;
}, array_values(unpack('C*', (string) $imageBytes)));
}
} finally {
fclose($stream);
}
return $images;
}
private function readLabels(string $labelPath): array
{
$stream = fopen($labelPath, 'rb');
if ($stream === false) {
throw new InvalidArgumentException('Could not open file: '.$labelPath);
}
$labels = [];
try {
$header = fread($stream, 8);
$fields = unpack('Nmagic/Nsize', (string) $header);
if ($fields['magic'] !== self::MAGIC_LABEL) {
throw new InvalidArgumentException('Invalid magic number: '.$labelPath);
}
$labels = fread($stream, $fields['size']);
} finally {
fclose($stream);
}
return array_values(unpack('C*', (string) $labels));
}
}
@@ -0,0 +1,131 @@
<?php
declare(strict_types=1);
namespace Phpml\Dataset;
use Phpml\Exception\DatasetException;
use Phpml\Exception\FileException;
class SvmDataset extends ArrayDataset
{
public function __construct(string $filePath)
{
[$samples, $targets] = self::readProblem($filePath);
parent::__construct($samples, $targets);
}
private static function readProblem(string $filePath): array
{
$handle = self::openFile($filePath);
$samples = [];
$targets = [];
$maxIndex = 0;
while (false !== $line = fgets($handle)) {
[$sample, $target, $maxIndex] = self::processLine($line, $maxIndex);
$samples[] = $sample;
$targets[] = $target;
}
fclose($handle);
foreach ($samples as &$sample) {
$sample = array_pad($sample, $maxIndex + 1, 0);
}
return [$samples, $targets];
}
/**
* @return resource
*/
private static function openFile(string $filePath)
{
if (!file_exists($filePath)) {
throw new FileException(sprintf('File "%s" missing.', basename($filePath)));
}
$handle = fopen($filePath, 'rb');
if ($handle === false) {
throw new FileException(sprintf('File "%s" can\'t be open.', basename($filePath)));
}
return $handle;
}
private static function processLine(string $line, int $maxIndex): array
{
$columns = self::parseLine($line);
$target = self::parseTargetColumn($columns[0]);
$sample = array_fill(0, $maxIndex + 1, 0);
$n = count($columns);
for ($i = 1; $i < $n; ++$i) {
[$index, $value] = self::parseFeatureColumn($columns[$i]);
if ($index > $maxIndex) {
$maxIndex = $index;
$sample = array_pad($sample, $maxIndex + 1, 0);
}
$sample[$index] = $value;
}
return [$sample, $target, $maxIndex];
}
private static function parseLine(string $line): array
{
$line = explode('#', $line, 2)[0];
$line = rtrim($line);
$line = str_replace("\t", ' ', $line);
return explode(' ', $line);
}
private static function parseTargetColumn(string $column): float
{
if (!is_numeric($column)) {
throw new DatasetException(sprintf('Invalid target "%s".', $column));
}
return (float) $column;
}
private static function parseFeatureColumn(string $column): array
{
$feature = explode(':', $column, 2);
if (count($feature) !== 2) {
throw new DatasetException(sprintf('Invalid value "%s".', $column));
}
$index = self::parseFeatureIndex($feature[0]);
$value = self::parseFeatureValue($feature[1]);
return [$index, $value];
}
private static function parseFeatureIndex(string $index): int
{
if (!is_numeric($index) || !ctype_digit($index)) {
throw new DatasetException(sprintf('Invalid index "%s".', $index));
}
if ((int) $index < 1) {
throw new DatasetException(sprintf('Invalid index "%s".', $index));
}
return (int) $index - 1;
}
private static function parseFeatureValue(string $value): float
{
if (!is_numeric($value)) {
throw new DatasetException(sprintf('Invalid value "%s".', $value));
}
return (float) $value;
}
}
@@ -0,0 +1,94 @@
<?php
declare(strict_types=1);
namespace Phpml\DimensionReduction;
use Phpml\Math\LinearAlgebra\EigenvalueDecomposition;
use Phpml\Math\Matrix;
/**
* Class to compute eigen pairs (values & vectors) of a given matrix
* with the consideration of numFeatures or totalVariance to be preserved
*
* @author hp
*/
abstract class EigenTransformerBase
{
/**
* Total variance to be conserved after the reduction
*
* @var float
*/
public $totalVariance = 0.9;
/**
* Number of features to be preserved after the reduction
*
* @var int
*/
public $numFeatures = null;
/**
* Top eigenvectors of the matrix
*
* @var array
*/
protected $eigVectors = [];
/**
* Top eigenValues of the matrix
*
* @var array
*/
protected $eigValues = [];
/**
* Calculates eigenValues and eigenVectors of the given matrix. Returns
* top eigenVectors along with the largest eigenValues. The total explained variance
* of these eigenVectors will be no less than desired $totalVariance value
*/
protected function eigenDecomposition(array $matrix): void
{
$eig = new EigenvalueDecomposition($matrix);
$eigVals = $eig->getRealEigenvalues();
$eigVects = $eig->getEigenvectors();
$totalEigVal = array_sum($eigVals);
// Sort eigenvalues in descending order
arsort($eigVals);
$explainedVar = 0.0;
$vectors = [];
$values = [];
foreach ($eigVals as $i => $eigVal) {
$explainedVar += $eigVal / $totalEigVal;
$vectors[] = $eigVects[$i];
$values[] = $eigVal;
if ($this->numFeatures !== null) {
if (count($vectors) == $this->numFeatures) {
break;
}
} else {
if ($explainedVar >= $this->totalVariance) {
break;
}
}
}
$this->eigValues = $values;
$this->eigVectors = $vectors;
}
/**
* Returns the reduced data
*/
protected function reduce(array $data): array
{
$m1 = new Matrix($data);
$m2 = new Matrix($this->eigVectors);
return $m1->multiply($m2->transpose())->toArray();
}
}
@@ -0,0 +1,234 @@
<?php
declare(strict_types=1);
namespace Phpml\DimensionReduction;
use Closure;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Exception\InvalidOperationException;
use Phpml\Math\Distance\Euclidean;
use Phpml\Math\Distance\Manhattan;
use Phpml\Math\Matrix;
class KernelPCA extends PCA
{
public const KERNEL_RBF = 1;
public const KERNEL_SIGMOID = 2;
public const KERNEL_LAPLACIAN = 3;
public const KERNEL_LINEAR = 4;
/**
* Selected kernel function
*
* @var int
*/
protected $kernel;
/**
* Gamma value used by the kernel
*
* @var float|null
*/
protected $gamma;
/**
* Original dataset used to fit KernelPCA
*
* @var array
*/
protected $data = [];
/**
* Kernel principal component analysis (KernelPCA) is an extension of PCA using
* techniques of kernel methods. It is more suitable for data that involves
* vectors that are not linearly separable<br><br>
* Example: <b>$kpca = new KernelPCA(KernelPCA::KERNEL_RBF, null, 2, 15.0);</b>
* will initialize the algorithm with an RBF kernel having the gamma parameter as 15,0. <br>
* This transformation will return the same number of rows with only <i>2</i> columns.
*
* @param float $totalVariance Total variance to be preserved if numFeatures is not given
* @param int $numFeatures Number of columns to be returned
* @param float $gamma Gamma parameter is used with RBF and Sigmoid kernels
*
* @throws InvalidArgumentException
*/
public function __construct(int $kernel = self::KERNEL_RBF, ?float $totalVariance = null, ?int $numFeatures = null, ?float $gamma = null)
{
if (!in_array($kernel, [self::KERNEL_RBF, self::KERNEL_SIGMOID, self::KERNEL_LAPLACIAN, self::KERNEL_LINEAR], true)) {
throw new InvalidArgumentException('KernelPCA can be initialized with the following kernels only: Linear, RBF, Sigmoid and Laplacian');
}
parent::__construct($totalVariance, $numFeatures);
$this->kernel = $kernel;
$this->gamma = $gamma;
}
/**
* Takes a data and returns a lower dimensional version
* of this data while preserving $totalVariance or $numFeatures. <br>
* $data is an n-by-m matrix and returned array is
* n-by-k matrix where k <= m
*/
public function fit(array $data): array
{
$numRows = count($data);
$this->data = $data;
if ($this->gamma === null) {
$this->gamma = 1.0 / $numRows;
}
$matrix = $this->calculateKernelMatrix($this->data, $numRows);
$matrix = $this->centerMatrix($matrix, $numRows);
$this->eigenDecomposition($matrix);
$this->fit = true;
return Matrix::transposeArray($this->eigVectors);
}
/**
* Transforms the given sample to a lower dimensional vector by using
* the variables obtained during the last run of <code>fit</code>.
*
* @throws InvalidArgumentException
* @throws InvalidOperationException
*/
public function transform(array $sample): array
{
if (!$this->fit) {
throw new InvalidOperationException('KernelPCA has not been fitted with respect to original dataset, please run KernelPCA::fit() first');
}
if (is_array($sample[0])) {
throw new InvalidArgumentException('KernelPCA::transform() accepts only one-dimensional arrays');
}
$pairs = $this->getDistancePairs($sample);
return $this->projectSample($pairs);
}
/**
* Calculates similarity matrix by use of selected kernel function<br>
* An n-by-m matrix is given and an n-by-n matrix is returned
*/
protected function calculateKernelMatrix(array $data, int $numRows): array
{
$kernelFunc = $this->getKernel();
$matrix = [];
for ($i = 0; $i < $numRows; ++$i) {
for ($k = 0; $k < $numRows; ++$k) {
if ($i <= $k) {
$matrix[$i][$k] = $kernelFunc($data[$i], $data[$k]);
} else {
$matrix[$i][$k] = $matrix[$k][$i];
}
}
}
return $matrix;
}
/**
* Kernel matrix is centered in its original space by using the following
* conversion:
*
* K = K N.K K.N + N.K.N where N is n-by-n matrix filled with 1/n
*/
protected function centerMatrix(array $matrix, int $n): array
{
$N = array_fill(0, $n, array_fill(0, $n, 1.0 / $n));
$N = new Matrix($N, false);
$K = new Matrix($matrix, false);
// K.N (This term is repeated so we cache it once)
$K_N = $K->multiply($N);
// N.K
$N_K = $N->multiply($K);
// N.K.N
$N_K_N = $N->multiply($K_N);
return $K->subtract($N_K)
->subtract($K_N)
->add($N_K_N)
->toArray();
}
/**
* Returns the callable kernel function
*
* @throws \Exception
*/
protected function getKernel(): Closure
{
switch ($this->kernel) {
case self::KERNEL_LINEAR:
// k(x,y) = xT.y
return function ($x, $y) {
return Matrix::dot($x, $y)[0];
};
case self::KERNEL_RBF:
// k(x,y)=exp(-γ.|x-y|) where |..| is Euclidean distance
$dist = new Euclidean();
return function ($x, $y) use ($dist): float {
return exp(-$this->gamma * $dist->sqDistance($x, $y));
};
case self::KERNEL_SIGMOID:
// k(x,y)=tanh(γ.xT.y+c0) where c0=1
return function ($x, $y): float {
$res = Matrix::dot($x, $y)[0] + 1.0;
return tanh((float) $this->gamma * $res);
};
case self::KERNEL_LAPLACIAN:
// k(x,y)=exp(-γ.|x-y|) where |..| is Manhattan distance
$dist = new Manhattan();
return function ($x, $y) use ($dist): float {
return exp(-$this->gamma * $dist->distance($x, $y));
};
default:
// Not reached
throw new InvalidArgumentException(sprintf('KernelPCA initialized with invalid kernel: %d', $this->kernel));
}
}
protected function getDistancePairs(array $sample): array
{
$kernel = $this->getKernel();
$pairs = [];
foreach ($this->data as $row) {
$pairs[] = $kernel($row, $sample);
}
return $pairs;
}
protected function projectSample(array $pairs): array
{
// Normalize eigenvectors by eig = eigVectors / eigValues
$func = function ($eigVal, $eigVect) {
$m = new Matrix($eigVect, false);
$a = $m->divideByScalar($eigVal)->toArray();
return $a[0];
};
$eig = array_map($func, $this->eigValues, $this->eigVectors);
// return k.dot(eig)
return Matrix::dot($pairs, $eig);
}
}
@@ -0,0 +1,223 @@
<?php
declare(strict_types=1);
namespace Phpml\DimensionReduction;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Exception\InvalidOperationException;
use Phpml\Math\Matrix;
class LDA extends EigenTransformerBase
{
/**
* @var bool
*/
public $fit = false;
/**
* @var array
*/
public $labels = [];
/**
* @var array
*/
public $means = [];
/**
* @var array
*/
public $counts = [];
/**
* @var float[]
*/
public $overallMean = [];
/**
* Linear Discriminant Analysis (LDA) is used to reduce the dimensionality
* of the data. Unlike Principal Component Analysis (PCA), it is a supervised
* technique that requires the class labels in order to fit the data to a
* lower dimensional space. <br><br>
* The algorithm can be initialized by speciyfing
* either with the totalVariance(a value between 0.1 and 0.99)
* or numFeatures (number of features in the dataset) to be preserved.
*
* @param float|null $totalVariance Total explained variance to be preserved
* @param int|null $numFeatures Number of features to be preserved
*
* @throws InvalidArgumentException
*/
public function __construct(?float $totalVariance = null, ?int $numFeatures = null)
{
if ($totalVariance !== null && ($totalVariance < 0.1 || $totalVariance > 0.99)) {
throw new InvalidArgumentException('Total variance can be a value between 0.1 and 0.99');
}
if ($numFeatures !== null && $numFeatures <= 0) {
throw new InvalidArgumentException('Number of features to be preserved should be greater than 0');
}
if (($totalVariance !== null) === ($numFeatures !== null)) {
throw new InvalidArgumentException('Either totalVariance or numFeatures should be specified in order to run the algorithm');
}
if ($numFeatures !== null) {
$this->numFeatures = $numFeatures;
}
if ($totalVariance !== null) {
$this->totalVariance = $totalVariance;
}
}
/**
* Trains the algorithm to transform the given data to a lower dimensional space.
*/
public function fit(array $data, array $classes): array
{
$this->labels = $this->getLabels($classes);
$this->means = $this->calculateMeans($data, $classes);
$sW = $this->calculateClassVar($data, $classes);
$sB = $this->calculateClassCov();
$S = $sW->inverse()->multiply($sB);
$this->eigenDecomposition($S->toArray());
$this->fit = true;
return $this->reduce($data);
}
/**
* Transforms the given sample to a lower dimensional vector by using
* the eigenVectors obtained in the last run of <code>fit</code>.
*
* @throws InvalidOperationException
*/
public function transform(array $sample): array
{
if (!$this->fit) {
throw new InvalidOperationException('LDA has not been fitted with respect to original dataset, please run LDA::fit() first');
}
if (!is_array($sample[0])) {
$sample = [$sample];
}
return $this->reduce($sample);
}
/**
* Returns unique labels in the dataset
*/
protected function getLabels(array $classes): array
{
$counts = array_count_values($classes);
return array_keys($counts);
}
/**
* Calculates mean of each column for each class and returns
* n by m matrix where n is number of labels and m is number of columns
*/
protected function calculateMeans(array $data, array $classes): array
{
$means = [];
$counts = [];
$overallMean = array_fill(0, count($data[0]), 0.0);
foreach ($data as $index => $row) {
$label = array_search($classes[$index], $this->labels, true);
foreach ($row as $col => $val) {
if (!isset($means[$label][$col])) {
$means[$label][$col] = 0.0;
}
$means[$label][$col] += $val;
$overallMean[$col] += $val;
}
if (!isset($counts[$label])) {
$counts[$label] = 0;
}
++$counts[$label];
}
foreach ($means as $index => $row) {
foreach ($row as $col => $sum) {
$means[$index][$col] = $sum / $counts[$index];
}
}
// Calculate overall mean of the dataset for each column
$numElements = array_sum($counts);
$map = function ($el) use ($numElements) {
return $el / $numElements;
};
$this->overallMean = array_map($map, $overallMean);
$this->counts = $counts;
return $means;
}
/**
* Returns in-class scatter matrix for each class, which
* is a n by m matrix where n is number of classes and
* m is number of columns
*/
protected function calculateClassVar(array $data, array $classes): Matrix
{
// s is an n (number of classes) by m (number of column) matrix
$s = array_fill(0, count($data[0]), array_fill(0, count($data[0]), 0));
$sW = new Matrix($s, false);
foreach ($data as $index => $row) {
$label = array_search($classes[$index], $this->labels, true);
$means = $this->means[$label];
$row = $this->calculateVar($row, $means);
$sW = $sW->add($row);
}
return $sW;
}
/**
* Returns between-class scatter matrix for each class, which
* is an n by m matrix where n is number of classes and
* m is number of columns
*/
protected function calculateClassCov(): Matrix
{
// s is an n (number of classes) by m (number of column) matrix
$s = array_fill(0, count($this->overallMean), array_fill(0, count($this->overallMean), 0));
$sB = new Matrix($s, false);
foreach ($this->means as $index => $classMeans) {
$row = $this->calculateVar($classMeans, $this->overallMean);
$N = $this->counts[$index];
$sB = $sB->add($row->multiplyByScalar($N));
}
return $sB;
}
/**
* Returns the result of the calculation (x - m)T.(x - m)
*/
protected function calculateVar(array $row, array $means): Matrix
{
$x = new Matrix($row, false);
$m = new Matrix($means, false);
$diff = $x->subtract($m);
return $diff->transpose()->multiply($diff);
}
}
@@ -0,0 +1,131 @@
<?php
declare(strict_types=1);
namespace Phpml\DimensionReduction;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Exception\InvalidOperationException;
use Phpml\Math\Statistic\Covariance;
use Phpml\Math\Statistic\Mean;
class PCA extends EigenTransformerBase
{
/**
* Temporary storage for mean values for each dimension in given data
*
* @var array
*/
protected $means = [];
/**
* @var bool
*/
protected $fit = false;
/**
* PCA (Principal Component Analysis) used to explain given
* data with lower number of dimensions. This analysis transforms the
* data to a lower dimensional version of it by conserving a proportion of total variance
* within the data. It is a lossy data compression technique.<br>
*
* @param float $totalVariance Total explained variance to be preserved
* @param int $numFeatures Number of features to be preserved
*
* @throws InvalidArgumentException
*/
public function __construct(?float $totalVariance = null, ?int $numFeatures = null)
{
if ($totalVariance !== null && ($totalVariance < 0.1 || $totalVariance > 0.99)) {
throw new InvalidArgumentException('Total variance can be a value between 0.1 and 0.99');
}
if ($numFeatures !== null && $numFeatures <= 0) {
throw new InvalidArgumentException('Number of features to be preserved should be greater than 0');
}
if (($totalVariance !== null) === ($numFeatures !== null)) {
throw new InvalidArgumentException('Either totalVariance or numFeatures should be specified in order to run the algorithm');
}
if ($numFeatures !== null) {
$this->numFeatures = $numFeatures;
}
if ($totalVariance !== null) {
$this->totalVariance = $totalVariance;
}
}
/**
* Takes a data and returns a lower dimensional version
* of this data while preserving $totalVariance or $numFeatures. <br>
* $data is an n-by-m matrix and returned array is
* n-by-k matrix where k <= m
*/
public function fit(array $data): array
{
$n = count($data[0]);
$data = $this->normalize($data, $n);
$covMatrix = Covariance::covarianceMatrix($data, array_fill(0, $n, 0));
$this->eigenDecomposition($covMatrix);
$this->fit = true;
return $this->reduce($data);
}
/**
* Transforms the given sample to a lower dimensional vector by using
* the eigenVectors obtained in the last run of <code>fit</code>.
*
* @throws InvalidOperationException
*/
public function transform(array $sample): array
{
if (!$this->fit) {
throw new InvalidOperationException('PCA has not been fitted with respect to original dataset, please run PCA::fit() first');
}
if (!is_array($sample[0])) {
$sample = [$sample];
}
$sample = $this->normalize($sample, count($sample[0]));
return $this->reduce($sample);
}
protected function calculateMeans(array $data, int $n): void
{
// Calculate means for each dimension
$this->means = [];
for ($i = 0; $i < $n; ++$i) {
$column = array_column($data, $i);
$this->means[] = Mean::arithmetic($column);
}
}
/**
* Normalization of the data includes subtracting mean from
* each dimension therefore dimensions will be centered to zero
*/
protected function normalize(array $data, int $n): array
{
if (count($this->means) === 0) {
$this->calculateMeans($data, $n);
}
// Normalize data
foreach (array_keys($data) as $i) {
for ($k = 0; $k < $n; ++$k) {
$data[$i][$k] -= $this->means[$k];
}
}
return $data;
}
}
@@ -0,0 +1,15 @@
<?php
declare(strict_types=1);
namespace Phpml;
interface Estimator
{
public function train(array $samples, array $targets): void;
/**
* @return mixed
*/
public function predict(array $samples);
}
@@ -0,0 +1,11 @@
<?php
declare(strict_types=1);
namespace Phpml\Exception;
use Exception;
class DatasetException extends Exception
{
}
@@ -0,0 +1,11 @@
<?php
declare(strict_types=1);
namespace Phpml\Exception;
use Exception;
class FileException extends Exception
{
}
@@ -0,0 +1,11 @@
<?php
declare(strict_types=1);
namespace Phpml\Exception;
use Exception;
class InvalidArgumentException extends Exception
{
}
@@ -0,0 +1,11 @@
<?php
declare(strict_types=1);
namespace Phpml\Exception;
use Exception;
class InvalidOperationException extends Exception
{
}
@@ -0,0 +1,11 @@
<?php
declare(strict_types=1);
namespace Phpml\Exception;
use Exception;
class LibsvmCommandException extends Exception
{
}
@@ -0,0 +1,11 @@
<?php
declare(strict_types=1);
namespace Phpml\Exception;
use Exception;
class MatrixException extends Exception
{
}
@@ -0,0 +1,11 @@
<?php
declare(strict_types=1);
namespace Phpml\Exception;
use Exception;
class NormalizerException extends Exception
{
}
@@ -0,0 +1,11 @@
<?php
declare(strict_types=1);
namespace Phpml\Exception;
use Exception;
class SerializeException extends Exception
{
}
@@ -0,0 +1,36 @@
<?php
declare(strict_types=1);
namespace Phpml\FeatureExtraction;
use Phpml\Exception\InvalidArgumentException;
class StopWords
{
/**
* @var array
*/
protected $stopWords = [];
public function __construct(array $stopWords)
{
$this->stopWords = array_fill_keys($stopWords, true);
}
public function isStopWord(string $token): bool
{
return isset($this->stopWords[$token]);
}
public static function factory(string $language = 'English'): self
{
$className = __NAMESPACE__."\\StopWords\\{$language}";
if (!class_exists($className)) {
throw new InvalidArgumentException(sprintf('Can\'t find "%s" language for StopWords', $language));
}
return new $className();
}
}
@@ -0,0 +1,33 @@
<?php
declare(strict_types=1);
namespace Phpml\FeatureExtraction\StopWords;
use Phpml\FeatureExtraction\StopWords;
final class English extends StopWords
{
/**
* @var array
*/
protected $stopWords = [
'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', 'aren\'t', 'as', 'at', 'be', 'because',
'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can\'t', 'cannot', 'could', 'couldn\'t', 'did', 'didn\'t',
'do', 'does', 'doesn\'t', 'doing', 'don\'t', 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn\'t', 'has',
'hasn\'t', 'have', 'haven\'t', 'having', 'he', 'he\'d', 'he\'ll', 'he\'s', 'her', 'here', 'here\'s', 'hers', 'herself', 'him',
'himself', 'his', 'how', 'how\'s', 'i', 'i\'d', 'i\'ll', 'i\'m', 'i\'ve', 'if', 'in', 'into', 'is', 'isn\'t', 'it', 'it\'s', 'its',
'itself', 'let\'s', 'me', 'more', 'most', 'mustn\'t', 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or',
'other', 'ought', 'our', 'oursourselves', 'out', 'over', 'own', 'same', 'shan\'t', 'she', 'she\'d', 'she\'ll', 'she\'s', 'should',
'shouldn\'t', 'so', 'some', 'such', 'than', 'that', 'that\'s', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there',
'there\'s', 'these', 'they', 'they\'d', 'they\'ll', 'they\'re', 'they\'ve', 'this', 'those', 'through', 'to', 'too', 'under',
'until', 'up', 'very', 'was', 'wasn\'t', 'we', 'we\'d', 'we\'ll', 'we\'re', 'we\'ve', 'were', 'weren\'t', 'what', 'what\'s',
'when', 'when\'s', 'where', 'where\'s', 'which', 'while', 'who', 'who\'s', 'whom', 'why', 'why\'s', 'with', 'won\'t', 'would',
'wouldn\'t', 'you', 'you\'d', 'you\'ll', 'you\'re', 'you\'ve', 'your', 'yours', 'yourself', 'yourselves',
];
public function __construct()
{
parent::__construct($this->stopWords);
}
}
@@ -0,0 +1,29 @@
<?php
declare(strict_types=1);
namespace Phpml\FeatureExtraction\StopWords;
use Phpml\FeatureExtraction\StopWords;
final class French extends StopWords
{
/**
* @var array
*/
protected $stopWords = [
'alors', 'au', 'aucuns', 'aussi', 'autre', 'avant', 'avec', 'avoir', 'bon', 'car', 'ce', 'cela', 'ces', 'ceux', 'chaque', 'ci',
'comme', 'comment', 'dans', 'des', 'du', 'dedans', 'dehors', 'depuis', 'devrait', 'doit', 'donc', 'dos', 'début', 'elle', 'elles',
'en', 'encore', 'essai', 'est', 'et', 'eu', 'fait', 'faites', 'fois', 'font', 'hors', 'ici', 'il', 'ils', 'je', 'juste', 'la',
'le', 'les', 'leur', 'là', 'ma', 'maintenant', 'mais', 'mes', 'mine', 'moins', 'mon', 'mot', 'même', 'ni', 'nommés', 'notre',
'nous', 'ou', 'où', 'par', 'parce', 'pas', 'peut', 'peu', 'plupart', 'pour', 'pourquoi', 'quand', 'que', 'quel', 'quelle',
'quelles', 'quels', 'qui', 'sa', 'sans', 'ses', 'seulement', 'si', 'sien', 'son', 'sont', 'sous', 'soyez', 'sujet', 'sur', 'ta',
'tandis', 'tellement', 'tels', 'tes', 'ton', 'tous', 'tout', 'trop', 'très', 'tu', 'voient', 'vont', 'votre', 'vous', 'vu',
'ça', 'étaient', 'état', 'étions', 'été', 'être',
];
public function __construct()
{
parent::__construct($this->stopWords);
}
}
@@ -0,0 +1,30 @@
<?php
declare(strict_types=1);
namespace Phpml\FeatureExtraction\StopWords;
use Phpml\FeatureExtraction\StopWords;
final class Polish extends StopWords
{
/**
* @var array
*/
protected $stopWords = [
'ach', 'aj', 'albo', 'bardzo', 'bez', 'bo', 'być', 'ci', 'cię', 'ciebie', 'co', 'czy', 'daleko', 'dla', 'dlaczego', 'dlatego',
'do', 'dobrze', 'dokąd', 'dość', 'dużo', 'dwa', 'dwaj', 'dwie', 'dwoje', 'dziś', 'dzisiaj', 'gdyby', 'gdzie', 'go', 'ich', 'ile',
'im', 'inny', 'ja', 'ją', 'jak', 'jakby', 'jaki', 'je', 'jeden', 'jedna', 'jedno', 'jego', 'jej', 'jemu', 'jeśli', 'jest', 'jestem',
'jeżeli', 'już', 'każdy', 'kiedy', 'kierunku', 'kto', 'ku', 'lub', 'ma', 'mają', 'mam', 'mi', 'mną', 'mnie', 'moi', 'mój', 'moja',
'moje', 'może', 'mu', 'my', 'na', 'nam', 'nami', 'nas', 'nasi', 'nasz', 'nasza', 'nasze', 'natychmiast', 'nią', 'nic', 'nich',
'nie', 'niego', 'niej', 'niemu', 'nigdy', 'nim', 'nimi', 'niż', 'obok', 'od', 'około', 'on', 'ona', 'one', 'oni', 'ono', 'owszem',
'po', 'pod', 'ponieważ', 'przed', 'przedtem', 'są', 'sam', 'sama', 'się', 'skąd', 'tak', 'taki', 'tam', 'ten', 'to', 'tobą', 'tobie',
'tu', 'tutaj', 'twoi', 'twój', 'twoja', 'twoje', 'ty', 'wam', 'wami', 'was', 'wasi', 'wasz', 'wasza', 'wasze', 'we', 'więc',
'wszystko', 'wtedy', 'wy', 'żaden', 'zawsze', 'że',
];
public function __construct()
{
parent::__construct($this->stopWords);
}
}
@@ -0,0 +1,30 @@
<?php
declare(strict_types=1);
namespace Phpml\FeatureExtraction\StopWords;
use Phpml\FeatureExtraction\StopWords;
final class Russian extends StopWords
{
/**
* @var array
*/
protected $stopWords = [
'и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с', 'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его', 'но', 'да', 'ты', 'к', 'у',
'же', 'вы', 'за', 'бы', 'по', 'только', 'ее', 'мне', 'было', 'вот', 'от', 'меня', 'еще', 'нет', 'о', 'из', 'ему', 'теперь', 'когда',
'даже', 'ну', 'вдруг', 'ли', 'если', 'уже', 'или', 'ни', 'быть', 'был', 'него', 'до', 'вас', 'нибудь', 'опять', 'уж', 'вам', 'ведь',
'там', 'потом', 'себя', 'ничего', 'ей', 'может', 'они', 'тут', 'где', 'есть', 'надо', 'ней', 'для', 'мы', 'тебя', 'их', 'чем', 'была',
'сам', 'чтоб', 'без', 'будто', 'чего', 'раз', 'тоже', 'себе', 'под', 'будет', 'ж', 'тогда', 'кто', 'этот', 'того', 'потому', 'этого',
'какой', 'совсем', 'ним', 'здесь', 'этом', 'один', 'почти', 'мой', 'тем', 'чтобы', 'нее', 'сейчас', 'были', 'куда', 'зачем', 'всех',
'никогда', 'можно', 'при', 'наконец', 'два', 'об', 'другой', 'хоть', 'после', 'над', 'больше', 'тот', 'через', 'эти', 'нас', 'про',
'всего', 'них', 'какая', 'много', 'разве', 'три', 'эту', 'моя', 'впрочем', 'хорошо', 'свою', 'этой', 'перед', 'иногда', 'лучше', 'чуть',
'том', 'нельзя', 'такой', 'им', 'более', 'всегда', 'конечно', 'всю', 'между',
];
public function __construct()
{
parent::__construct($this->stopWords);
}
}
@@ -0,0 +1,54 @@
<?php
declare(strict_types=1);
namespace Phpml\FeatureExtraction;
use Phpml\Transformer;
class TfIdfTransformer implements Transformer
{
/**
* @var array
*/
private $idf = [];
public function __construct(array $samples = [])
{
if (count($samples) > 0) {
$this->fit($samples);
}
}
public function fit(array $samples, ?array $targets = null): void
{
$this->countTokensFrequency($samples);
$count = count($samples);
foreach ($this->idf as &$value) {
$value = log((float) ($count / $value), 10.0);
}
}
public function transform(array &$samples, ?array &$targets = null): void
{
foreach ($samples as &$sample) {
foreach ($sample as $index => &$feature) {
$feature *= $this->idf[$index];
}
}
}
private function countTokensFrequency(array $samples): void
{
$this->idf = array_fill_keys(array_keys($samples[0]), 0);
foreach ($samples as $sample) {
foreach ($sample as $index => $count) {
if ($count > 0) {
++$this->idf[$index];
}
}
}
}
}
@@ -0,0 +1,166 @@
<?php
declare(strict_types=1);
namespace Phpml\FeatureExtraction;
use Phpml\Tokenization\Tokenizer;
use Phpml\Transformer;
class TokenCountVectorizer implements Transformer
{
/**
* @var Tokenizer
*/
private $tokenizer;
/**
* @var StopWords|null
*/
private $stopWords;
/**
* @var float
*/
private $minDF;
/**
* @var array
*/
private $vocabulary = [];
/**
* @var array
*/
private $frequencies = [];
public function __construct(Tokenizer $tokenizer, ?StopWords $stopWords = null, float $minDF = 0.0)
{
$this->tokenizer = $tokenizer;
$this->stopWords = $stopWords;
$this->minDF = $minDF;
}
public function fit(array $samples, ?array $targets = null): void
{
$this->buildVocabulary($samples);
}
public function transform(array &$samples, ?array &$targets = null): void
{
array_walk($samples, function (string &$sample): void {
$this->transformSample($sample);
});
$this->checkDocumentFrequency($samples);
}
public function getVocabulary(): array
{
return array_flip($this->vocabulary);
}
private function buildVocabulary(array &$samples): void
{
foreach ($samples as $sample) {
$tokens = $this->tokenizer->tokenize($sample);
foreach ($tokens as $token) {
$this->addTokenToVocabulary($token);
}
}
}
private function transformSample(string &$sample): void
{
$counts = [];
$tokens = $this->tokenizer->tokenize($sample);
foreach ($tokens as $token) {
$index = $this->getTokenIndex($token);
if ($index !== false) {
$this->updateFrequency($token);
if (!isset($counts[$index])) {
$counts[$index] = 0;
}
++$counts[$index];
}
}
foreach ($this->vocabulary as $index) {
if (!isset($counts[$index])) {
$counts[$index] = 0;
}
}
ksort($counts);
$sample = $counts;
}
/**
* @return int|bool
*/
private function getTokenIndex(string $token)
{
if ($this->isStopWord($token)) {
return false;
}
return $this->vocabulary[$token] ?? false;
}
private function addTokenToVocabulary(string $token): void
{
if ($this->isStopWord($token)) {
return;
}
if (!isset($this->vocabulary[$token])) {
$this->vocabulary[$token] = count($this->vocabulary);
}
}
private function isStopWord(string $token): bool
{
return $this->stopWords !== null && $this->stopWords->isStopWord($token);
}
private function updateFrequency(string $token): void
{
if (!isset($this->frequencies[$token])) {
$this->frequencies[$token] = 0;
}
++$this->frequencies[$token];
}
private function checkDocumentFrequency(array &$samples): void
{
if ($this->minDF > 0) {
$beyondMinimum = $this->getBeyondMinimumIndexes(count($samples));
foreach ($samples as &$sample) {
$this->resetBeyondMinimum($sample, $beyondMinimum);
}
}
}
private function resetBeyondMinimum(array &$sample, array $beyondMinimum): void
{
foreach ($beyondMinimum as $index) {
$sample[$index] = 0;
}
}
private function getBeyondMinimumIndexes(int $samplesCount): array
{
$indexes = [];
foreach ($this->frequencies as $token => $frequency) {
if (($frequency / $samplesCount) < $this->minDF) {
$indexes[] = $this->getTokenIndex((string) $token);
}
}
return $indexes;
}
}
@@ -0,0 +1,10 @@
<?php
declare(strict_types=1);
namespace Phpml\FeatureSelection;
interface ScoringFunction
{
public function score(array $samples, array $targets): array;
}
@@ -0,0 +1,21 @@
<?php
declare(strict_types=1);
namespace Phpml\FeatureSelection\ScoringFunction;
use Phpml\FeatureSelection\ScoringFunction;
use Phpml\Math\Statistic\ANOVA;
final class ANOVAFValue implements ScoringFunction
{
public function score(array $samples, array $targets): array
{
$grouped = [];
foreach ($samples as $index => $sample) {
$grouped[$targets[$index]][] = $sample;
}
return ANOVA::oneWayF(array_values($grouped));
}
}
@@ -0,0 +1,81 @@
<?php
declare(strict_types=1);
namespace Phpml\FeatureSelection\ScoringFunction;
use Phpml\FeatureSelection\ScoringFunction;
use Phpml\Math\Matrix;
use Phpml\Math\Statistic\Mean;
/**
* Quick linear model for testing the effect of a single regressor,
* sequentially for many regressors.
*
* This is done in 2 steps:
*
* 1. The cross correlation between each regressor and the target is computed,
* that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) *std(y)).
* 2. It is converted to an F score.
*
* Ported from scikit-learn f_regression function (http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html#sklearn.feature_selection.f_regression)
*/
final class UnivariateLinearRegression implements ScoringFunction
{
/**
* @var bool
*/
private $center;
/**
* @param bool $center - if true samples and targets will be centered
*/
public function __construct(bool $center = true)
{
$this->center = $center;
}
public function score(array $samples, array $targets): array
{
if ($this->center) {
$this->centerTargets($targets);
$this->centerSamples($samples);
}
$correlations = [];
foreach (array_keys($samples[0]) as $index) {
$featureColumn = array_column($samples, $index);
$correlations[$index] =
Matrix::dot($targets, $featureColumn)[0] / (new Matrix($featureColumn, false))->transpose()->frobeniusNorm()
/ (new Matrix($targets, false))->frobeniusNorm();
}
$degreesOfFreedom = count($targets) - ($this->center ? 2 : 1);
return array_map(function (float $correlation) use ($degreesOfFreedom): float {
return $correlation ** 2 / (1 - $correlation ** 2) * $degreesOfFreedom;
}, $correlations);
}
private function centerTargets(array &$targets): void
{
$mean = Mean::arithmetic($targets);
array_walk($targets, function (&$target) use ($mean): void {
$target -= $mean;
});
}
private function centerSamples(array &$samples): void
{
$means = [];
foreach ($samples[0] as $index => $feature) {
$means[$index] = Mean::arithmetic(array_column($samples, $index));
}
foreach ($samples as &$sample) {
foreach ($sample as $index => &$feature) {
$feature -= $means[$index];
}
}
}
}
@@ -0,0 +1,78 @@
<?php
declare(strict_types=1);
namespace Phpml\FeatureSelection;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Exception\InvalidOperationException;
use Phpml\FeatureSelection\ScoringFunction\ANOVAFValue;
use Phpml\Transformer;
final class SelectKBest implements Transformer
{
/**
* @var ScoringFunction
*/
private $scoringFunction;
/**
* @var int
*/
private $k;
/**
* @var array|null
*/
private $scores = null;
/**
* @var array|null
*/
private $keepColumns = null;
public function __construct(int $k = 10, ?ScoringFunction $scoringFunction = null)
{
if ($scoringFunction === null) {
$scoringFunction = new ANOVAFValue();
}
$this->scoringFunction = $scoringFunction;
$this->k = $k;
}
public function fit(array $samples, ?array $targets = null): void
{
if ($targets === null || count($targets) === 0) {
throw new InvalidArgumentException('The array has zero elements');
}
$this->scores = $sorted = $this->scoringFunction->score($samples, $targets);
if ($this->k >= count($sorted)) {
return;
}
arsort($sorted);
$this->keepColumns = array_slice($sorted, 0, $this->k, true);
}
public function transform(array &$samples, ?array &$targets = null): void
{
if ($this->keepColumns === null) {
return;
}
foreach ($samples as &$sample) {
$sample = array_values(array_intersect_key($sample, $this->keepColumns));
}
}
public function scores(): array
{
if ($this->scores === null) {
throw new InvalidOperationException('SelectKBest require to fit first to get scores');
}
return $this->scores;
}
}
@@ -0,0 +1,57 @@
<?php
declare(strict_types=1);
namespace Phpml\FeatureSelection;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Math\Matrix;
use Phpml\Math\Statistic\Variance;
use Phpml\Transformer;
final class VarianceThreshold implements Transformer
{
/**
* @var float
*/
private $threshold;
/**
* @var array
*/
private $variances = [];
/**
* @var array
*/
private $keepColumns = [];
public function __construct(float $threshold = 0.0)
{
if ($threshold < 0) {
throw new InvalidArgumentException('Threshold can\'t be lower than zero');
}
$this->threshold = $threshold;
}
public function fit(array $samples, ?array $targets = null): void
{
$this->variances = array_map(static function (array $column): float {
return Variance::population($column);
}, Matrix::transposeArray($samples));
foreach ($this->variances as $column => $variance) {
if ($variance > $this->threshold) {
$this->keepColumns[$column] = true;
}
}
}
public function transform(array &$samples, ?array &$targets = null): void
{
foreach ($samples as &$sample) {
$sample = array_values(array_intersect_key($sample, $this->keepColumns));
}
}
}
@@ -0,0 +1,72 @@
<?php
declare(strict_types=1);
namespace Phpml;
use Phpml\Exception\InvalidArgumentException;
final class FeatureUnion implements Transformer
{
/**
* @var Pipeline[]
*/
private $pipelines = [];
/**
* @param Pipeline[] $pipelines
*/
public function __construct(array $pipelines)
{
if ($pipelines === []) {
throw new InvalidArgumentException('At least one pipeline is required');
}
$this->pipelines = array_map(static function (Pipeline $pipeline): Pipeline {
return $pipeline;
}, $pipelines);
}
public function fit(array $samples, ?array $targets = null): void
{
$originSamples = $samples;
foreach ($this->pipelines as $pipeline) {
foreach ($pipeline->getTransformers() as $transformer) {
$transformer->fit($samples, $targets);
$transformer->transform($samples, $targets);
}
$samples = $originSamples;
}
}
public function transform(array &$samples, ?array &$targets = null): void
{
$this->transformSamples($samples, $targets);
}
public function fitAndTransform(array &$samples, ?array &$targets = null): void
{
$this->transformSamples($samples, $targets, true);
}
private function transformSamples(array &$samples, ?array &$targets = null, bool $fit = false): void
{
$union = [];
$originSamples = $samples;
foreach ($this->pipelines as $pipeline) {
foreach ($pipeline->getTransformers() as $transformer) {
if ($fit) {
$transformer->fit($samples, $targets);
}
$transformer->transform($samples, $targets);
}
foreach ($samples as $index => $sample) {
$union[$index] = array_merge($union[$index] ?? [], is_array($sample) ? $sample : [$sample]);
}
$samples = $originSamples;
}
$samples = $union;
}
}
@@ -0,0 +1,169 @@
<?php
declare(strict_types=1);
namespace Phpml\Helper;
use Phpml\Classification\Classifier;
trait OneVsRest
{
/**
* @var array
*/
protected $classifiers = [];
/**
* All provided training targets' labels.
*
* @var array
*/
protected $allLabels = [];
/**
* @var array
*/
protected $costValues = [];
/**
* Train a binary classifier in the OvR style
*/
public function train(array $samples, array $targets): void
{
// Clears previous stuff.
$this->reset();
$this->trainByLabel($samples, $targets);
}
/**
* Resets the classifier and the vars internally used by OneVsRest to create multiple classifiers.
*/
public function reset(): void
{
$this->classifiers = [];
$this->allLabels = [];
$this->costValues = [];
$this->resetBinary();
}
protected function trainByLabel(array $samples, array $targets, array $allLabels = []): void
{
// Overwrites the current value if it exist. $allLabels must be provided for each partialTrain run.
$this->allLabels = count($allLabels) === 0 ? array_keys(array_count_values($targets)) : $allLabels;
sort($this->allLabels, SORT_STRING);
// If there are only two targets, then there is no need to perform OvR
if (count($this->allLabels) === 2) {
// Init classifier if required.
if (count($this->classifiers) === 0) {
$this->classifiers[0] = $this->getClassifierCopy();
}
$this->classifiers[0]->trainBinary($samples, $targets, $this->allLabels);
} else {
// Train a separate classifier for each label and memorize them
foreach ($this->allLabels as $label) {
// Init classifier if required.
if (!isset($this->classifiers[$label])) {
$this->classifiers[$label] = $this->getClassifierCopy();
}
[$binarizedTargets, $classifierLabels] = $this->binarizeTargets($targets, $label);
$this->classifiers[$label]->trainBinary($samples, $binarizedTargets, $classifierLabels);
}
}
// If the underlying classifier is capable of giving the cost values
// during the training, then assign it to the relevant variable
// Adding just the first classifier cost values to avoid complex average calculations.
$classifierref = reset($this->classifiers);
if (method_exists($classifierref, 'getCostValues')) {
$this->costValues = $classifierref->getCostValues();
}
}
/**
* Returns an instance of the current class after cleaning up OneVsRest stuff.
*/
protected function getClassifierCopy(): Classifier
{
// Clone the current classifier, so that
// we don't mess up its variables while training
// multiple instances of this classifier
$classifier = clone $this;
$classifier->reset();
return $classifier;
}
/**
* @return mixed
*/
protected function predictSample(array $sample)
{
if (count($this->allLabels) === 2) {
return $this->classifiers[0]->predictSampleBinary($sample);
}
$probs = [];
foreach ($this->classifiers as $label => $predictor) {
$probs[$label] = $predictor->predictProbability($sample, $label);
}
arsort($probs, SORT_NUMERIC);
return key($probs);
}
/**
* Each classifier should implement this method instead of train(samples, targets)
*/
abstract protected function trainBinary(array $samples, array $targets, array $labels);
/**
* To be overwritten by OneVsRest classifiers.
*/
abstract protected function resetBinary(): void;
/**
* Each classifier that make use of OvR approach should be able to
* return a probability for a sample to belong to the given label.
*
* @return mixed
*/
abstract protected function predictProbability(array $sample, string $label);
/**
* Each classifier should implement this method instead of predictSample()
*
* @return mixed
*/
abstract protected function predictSampleBinary(array $sample);
/**
* Groups all targets into two groups: Targets equal to
* the given label and the others
*
* $targets is not passed by reference nor contains objects so this method
* changes will not affect the caller $targets array.
*
* @param mixed $label
*
* @return array Binarized targets and target's labels
*/
private function binarizeTargets(array $targets, $label): array
{
$notLabel = "not_{$label}";
foreach ($targets as $key => $target) {
$targets[$key] = $target == $label ? $label : $notLabel;
}
$labels = [$label, $notLabel];
return [$targets, $labels];
}
}
@@ -0,0 +1,304 @@
<?php
declare(strict_types=1);
namespace Phpml\Helper\Optimizer;
use Closure;
/**
* Conjugate Gradient method to solve a non-linear f(x) with respect to unknown x
* See https://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method)
*
* The method applied below is explained in the below document in a practical manner
* - http://web.cs.iastate.edu/~cs577/handouts/conjugate-gradient.pdf
*
* However it is compliant with the general Conjugate Gradient method with
* Fletcher-Reeves update method. Note that, the f(x) is assumed to be one-dimensional
* and one gradient is utilized for all dimensions in the given data.
*/
class ConjugateGradient extends GD
{
public function runOptimization(array $samples, array $targets, Closure $gradientCb): array
{
$this->samples = $samples;
$this->targets = $targets;
$this->gradientCb = $gradientCb;
$this->sampleCount = count($samples);
$this->costValues = [];
$d = MP::muls($this->gradient($this->theta), -1);
for ($i = 0; $i < $this->maxIterations; ++$i) {
// Obtain α that minimizes f(θ + α.d)
$alpha = $this->getAlpha($d);
// θ(k+1) = θ(k) + α.d
$thetaNew = $this->getNewTheta($alpha, $d);
// β = ||∇f(x(k+1))||² ||∇f(x(k))||²
$beta = $this->getBeta($thetaNew);
// d(k+1) =–∇f(x(k+1)) + β(k).d(k)
$d = $this->getNewDirection($thetaNew, $beta, $d);
// Save values for the next iteration
$oldTheta = $this->theta;
$this->costValues[] = $this->cost($thetaNew);
$this->theta = $thetaNew;
if ($this->enableEarlyStop && $this->earlyStop($oldTheta)) {
break;
}
}
$this->clear();
return $this->theta;
}
/**
* Executes the callback function for the problem and returns
* sum of the gradient for all samples & targets.
*/
protected function gradient(array $theta): array
{
[, $updates, $penalty] = parent::gradient($theta);
// Calculate gradient for each dimension
$gradient = [];
for ($i = 0; $i <= $this->dimensions; ++$i) {
if ($i === 0) {
$gradient[$i] = array_sum($updates);
} else {
$col = array_column($this->samples, $i - 1);
$error = 0;
foreach ($col as $index => $val) {
$error += $val * $updates[$index];
}
$gradient[$i] = $error + $penalty * $theta[$i];
}
}
return $gradient;
}
/**
* Returns the value of f(x) for given solution
*/
protected function cost(array $theta): float
{
[$cost] = parent::gradient($theta);
return array_sum($cost) / (int) $this->sampleCount;
}
/**
* Calculates alpha that minimizes the function f(θ + α.d)
* by performing a line search that does not rely upon the derivation.
*
* There are several alternatives for this function. For now, we
* prefer a method inspired from the bisection method for its simplicity.
* This algorithm attempts to find an optimum alpha value between 0.0001 and 0.01
*
* Algorithm as follows:
* a) Probe a small alpha (0.0001) and calculate cost function
* b) Probe a larger alpha (0.01) and calculate cost function
* b-1) If cost function decreases, continue enlarging alpha
* b-2) If cost function increases, take the midpoint and try again
*/
protected function getAlpha(array $d): float
{
$small = MP::muls($d, 0.0001);
$large = MP::muls($d, 0.01);
// Obtain θ + α.d for two initial values, x0 and x1
$x0 = MP::add($this->theta, $small);
$x1 = MP::add($this->theta, $large);
$epsilon = 0.0001;
$iteration = 0;
do {
$fx1 = $this->cost($x1);
$fx0 = $this->cost($x0);
// If the difference between two values is small enough
// then break the loop
if (abs($fx1 - $fx0) <= $epsilon) {
break;
}
if ($fx1 < $fx0) {
$x0 = $x1;
$x1 = MP::adds($x1, 0.01); // Enlarge second
} else {
$x1 = MP::divs(MP::add($x1, $x0), 2.0);
} // Get to the midpoint
$error = $fx1 / $this->dimensions;
} while ($error <= $epsilon || $iteration++ < 10);
// Return α = θ / d
// For accuracy, choose a dimension which maximize |d[i]|
$imax = 0;
for ($i = 1; $i <= $this->dimensions; ++$i) {
if (abs($d[$i]) > abs($d[$imax])) {
$imax = $i;
}
}
if ($d[$imax] == 0) {
return $x1[$imax] - $this->theta[$imax];
}
return ($x1[$imax] - $this->theta[$imax]) / $d[$imax];
}
/**
* Calculates new set of solutions with given alpha (for each θ(k)) and
* gradient direction.
*
* θ(k+1) = θ(k) + α.d
*/
protected function getNewTheta(float $alpha, array $d): array
{
return MP::add($this->theta, MP::muls($d, $alpha));
}
/**
* Calculates new beta (β) for given set of solutions by using
* FletcherReeves method.
*
* β = ||f(x(k+1))||² ||f(x(k))||²
*
* See:
* R. Fletcher and C. M. Reeves, "Function minimization by conjugate gradients", Comput. J. 7 (1964), 149154.
*/
protected function getBeta(array $newTheta): float
{
$gNew = $this->gradient($newTheta);
$gOld = $this->gradient($this->theta);
$dNew = 0;
$dOld = 1e-100;
for ($i = 0; $i <= $this->dimensions; ++$i) {
$dNew += $gNew[$i] ** 2;
$dOld += $gOld[$i] ** 2;
}
return $dNew / $dOld;
}
/**
* Calculates the new conjugate direction
*
* d(k+1) =–∇f(x(k+1)) + β(k).d(k)
*/
protected function getNewDirection(array $theta, float $beta, array $d): array
{
$grad = $this->gradient($theta);
return MP::add(MP::muls($grad, -1), MP::muls($d, $beta));
}
}
/**
* Handles element-wise vector operations between vector-vector
* and vector-scalar variables
*/
class MP
{
/**
* Element-wise <b>multiplication</b> of two vectors of the same size
*/
public static function mul(array $m1, array $m2): array
{
$res = [];
foreach ($m1 as $i => $val) {
$res[] = $val * $m2[$i];
}
return $res;
}
/**
* Element-wise <b>division</b> of two vectors of the same size
*/
public static function div(array $m1, array $m2): array
{
$res = [];
foreach ($m1 as $i => $val) {
$res[] = $val / $m2[$i];
}
return $res;
}
/**
* Element-wise <b>addition</b> of two vectors of the same size
*/
public static function add(array $m1, array $m2, int $mag = 1): array
{
$res = [];
foreach ($m1 as $i => $val) {
$res[] = $val + $mag * $m2[$i];
}
return $res;
}
/**
* Element-wise <b>subtraction</b> of two vectors of the same size
*/
public static function sub(array $m1, array $m2): array
{
return self::add($m1, $m2, -1);
}
/**
* Element-wise <b>multiplication</b> of a vector with a scalar
*/
public static function muls(array $m1, float $m2): array
{
$res = [];
foreach ($m1 as $val) {
$res[] = $val * $m2;
}
return $res;
}
/**
* Element-wise <b>division</b> of a vector with a scalar
*/
public static function divs(array $m1, float $m2): array
{
$res = [];
foreach ($m1 as $val) {
$res[] = $val / ($m2 + 1e-32);
}
return $res;
}
/**
* Element-wise <b>addition</b> of a vector with a scalar
*/
public static function adds(array $m1, float $m2, int $mag = 1): array
{
$res = [];
foreach ($m1 as $val) {
$res[] = $val + $mag * $m2;
}
return $res;
}
/**
* Element-wise <b>subtraction</b> of a vector with a scalar
*/
public static function subs(array $m1, float $m2): array
{
return self::adds($m1, $m2, -1);
}
}
@@ -0,0 +1,111 @@
<?php
declare(strict_types=1);
namespace Phpml\Helper\Optimizer;
use Closure;
use Phpml\Exception\InvalidOperationException;
/**
* Batch version of Gradient Descent to optimize the weights
* of a classifier given samples, targets and the objective function to minimize
*/
class GD extends StochasticGD
{
/**
* Number of samples given
*
* @var int|null
*/
protected $sampleCount;
public function runOptimization(array $samples, array $targets, Closure $gradientCb): array
{
$this->samples = $samples;
$this->targets = $targets;
$this->gradientCb = $gradientCb;
$this->sampleCount = count($this->samples);
// Batch learning is executed:
$currIter = 0;
$this->costValues = [];
while ($this->maxIterations > $currIter++) {
$theta = $this->theta;
// Calculate update terms for each sample
[$errors, $updates, $totalPenalty] = $this->gradient($theta);
$this->updateWeightsWithUpdates($updates, $totalPenalty);
$this->costValues[] = array_sum($errors) / (int) $this->sampleCount;
if ($this->earlyStop($theta)) {
break;
}
}
$this->clear();
return $this->theta;
}
/**
* Calculates gradient, cost function and penalty term for each sample
* then returns them as an array of values
*/
protected function gradient(array $theta): array
{
$costs = [];
$gradient = [];
$totalPenalty = 0;
if ($this->gradientCb === null) {
throw new InvalidOperationException('Gradient callback is not defined');
}
foreach ($this->samples as $index => $sample) {
$target = $this->targets[$index];
$result = ($this->gradientCb)($theta, $sample, $target);
[$cost, $grad, $penalty] = array_pad($result, 3, 0);
$costs[] = $cost;
$gradient[] = $grad;
$totalPenalty += $penalty;
}
$totalPenalty /= $this->sampleCount;
return [$costs, $gradient, $totalPenalty];
}
protected function updateWeightsWithUpdates(array $updates, float $penalty): void
{
// Updates all weights at once
for ($i = 0; $i <= $this->dimensions; ++$i) {
if ($i === 0) {
$this->theta[0] -= $this->learningRate * array_sum($updates);
} else {
$col = array_column($this->samples, $i - 1);
$error = 0;
foreach ($col as $index => $val) {
$error += $val * $updates[$index];
}
$this->theta[$i] -= $this->learningRate *
($error + $penalty * $this->theta[$i]);
}
}
}
/**
* Clears the optimizer internal vars after the optimization process.
*/
protected function clear(): void
{
$this->sampleCount = null;
parent::clear();
}
}
@@ -0,0 +1,61 @@
<?php
declare(strict_types=1);
namespace Phpml\Helper\Optimizer;
use Closure;
use Phpml\Exception\InvalidArgumentException;
abstract class Optimizer
{
/**
* Unknown variables to be found
*
* @var array
*/
protected $theta = [];
/**
* Number of dimensions
*
* @var int
*/
protected $dimensions;
/**
* Inits a new instance of Optimizer for the given number of dimensions
*/
public function __construct(int $dimensions)
{
$this->dimensions = $dimensions;
// Inits the weights randomly
$this->theta = [];
for ($i = 0; $i < $this->dimensions; ++$i) {
$this->theta[] = (random_int(0, PHP_INT_MAX) / PHP_INT_MAX) + 0.1;
}
}
public function setTheta(array $theta): self
{
if (count($theta) !== $this->dimensions) {
throw new InvalidArgumentException(sprintf('Number of values in the weights array should be %s', $this->dimensions));
}
$this->theta = $theta;
return $this;
}
public function theta(): array
{
return $this->theta;
}
/**
* Executes the optimization with the given samples & targets
* and returns the weights
*/
abstract public function runOptimization(array $samples, array $targets, Closure $gradientCb): array;
}
@@ -0,0 +1,278 @@
<?php
declare(strict_types=1);
namespace Phpml\Helper\Optimizer;
use Closure;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Exception\InvalidOperationException;
/**
* Stochastic Gradient Descent optimization method
* to find a solution for the equation A.ϴ = y where
* A (samples) and y (targets) are known and ϴ is unknown.
*/
class StochasticGD extends Optimizer
{
/**
* A (samples)
*
* @var array
*/
protected $samples = [];
/**
* y (targets)
*
* @var array
*/
protected $targets = [];
/**
* Callback function to get the gradient and cost value
* for a specific set of theta (ϴ) and a pair of sample & target
*
* @var \Closure|null
*/
protected $gradientCb;
/**
* Maximum number of iterations used to train the model
*
* @var int
*/
protected $maxIterations = 1000;
/**
* Learning rate is used to control the speed of the optimization.<br>
*
* Larger values of lr may overshoot the optimum or even cause divergence
* while small values slows down the convergence and increases the time
* required for the training
*
* @var float
*/
protected $learningRate = 0.001;
/**
* Minimum amount of change in the weights and error values
* between iterations that needs to be obtained to continue the training
*
* @var float
*/
protected $threshold = 1e-4;
/**
* Enable/Disable early stopping by checking the weight & cost values
* to see whether they changed large enough to continue the optimization
*
* @var bool
*/
protected $enableEarlyStop = true;
/**
* List of values obtained by evaluating the cost function at each iteration
* of the algorithm
*
* @var array
*/
protected $costValues = [];
/**
* Initializes the SGD optimizer for the given number of dimensions
*/
public function __construct(int $dimensions)
{
// Add one more dimension for the bias
parent::__construct($dimensions + 1);
$this->dimensions = $dimensions;
}
public function setTheta(array $theta): Optimizer
{
if (count($theta) !== $this->dimensions + 1) {
throw new InvalidArgumentException(sprintf('Number of values in the weights array should be %s', $this->dimensions + 1));
}
$this->theta = $theta;
return $this;
}
/**
* Sets minimum value for the change in the theta values
* between iterations to continue the iterations.<br>
*
* If change in the theta is less than given value then the
* algorithm will stop training
*
* @return $this
*/
public function setChangeThreshold(float $threshold = 1e-5)
{
$this->threshold = $threshold;
return $this;
}
/**
* Enable/Disable early stopping by checking at each iteration
* whether changes in theta or cost value are not large enough
*
* @return $this
*/
public function setEarlyStop(bool $enable = true)
{
$this->enableEarlyStop = $enable;
return $this;
}
/**
* @return $this
*/
public function setLearningRate(float $learningRate)
{
$this->learningRate = $learningRate;
return $this;
}
/**
* @return $this
*/
public function setMaxIterations(int $maxIterations)
{
$this->maxIterations = $maxIterations;
return $this;
}
/**
* Optimization procedure finds the unknow variables for the equation A.ϴ = y
* for the given samples (A) and targets (y).<br>
*
* The cost function to minimize and the gradient of the function are to be
* handled by the callback function provided as the third parameter of the method.
*/
public function runOptimization(array $samples, array $targets, Closure $gradientCb): array
{
$this->samples = $samples;
$this->targets = $targets;
$this->gradientCb = $gradientCb;
$currIter = 0;
$bestTheta = null;
$bestScore = 0.0;
$this->costValues = [];
while ($this->maxIterations > $currIter++) {
$theta = $this->theta;
// Update the guess
$cost = $this->updateTheta();
// Save the best theta in the "pocket" so that
// any future set of theta worse than this will be disregarded
if ($bestTheta === null || $cost <= $bestScore) {
$bestTheta = $theta;
$bestScore = $cost;
}
// Add the cost value for this iteration to the list
$this->costValues[] = $cost;
// Check for early stop
if ($this->enableEarlyStop && $this->earlyStop($theta)) {
break;
}
}
$this->clear();
// Solution in the pocket is better than or equal to the last state
// so, we use this solution
return $this->theta = (array) $bestTheta;
}
/**
* Returns the list of cost values for each iteration executed in
* last run of the optimization
*/
public function getCostValues(): array
{
return $this->costValues;
}
protected function updateTheta(): float
{
$jValue = 0.0;
$theta = $this->theta;
if ($this->gradientCb === null) {
throw new InvalidOperationException('Gradient callback is not defined');
}
foreach ($this->samples as $index => $sample) {
$target = $this->targets[$index];
$result = ($this->gradientCb)($theta, $sample, $target);
[$error, $gradient, $penalty] = array_pad($result, 3, 0);
// Update bias
$this->theta[0] -= $this->learningRate * $gradient;
// Update other values
for ($i = 1; $i <= $this->dimensions; ++$i) {
$this->theta[$i] -= $this->learningRate *
($gradient * $sample[$i - 1] + $penalty * $this->theta[$i]);
}
// Sum error rate
$jValue += $error;
}
return $jValue / count($this->samples);
}
/**
* Checks if the optimization is not effective enough and can be stopped
* in case large enough changes in the solution do not happen
*/
protected function earlyStop(array $oldTheta): bool
{
// Check for early stop: No change larger than threshold (default 1e-5)
$diff = array_map(
function ($w1, $w2) {
return abs($w1 - $w2) > $this->threshold ? 1 : 0;
},
$oldTheta,
$this->theta
);
if (array_sum($diff) == 0) {
return true;
}
// Check if the last two cost values are almost the same
$costs = array_slice($this->costValues, -2);
if (count($costs) === 2 && abs($costs[1] - $costs[0]) < $this->threshold) {
return true;
}
return false;
}
/**
* Clears the optimizer internal vars after the optimization process.
*/
protected function clear(): void
{
$this->samples = [];
$this->targets = [];
$this->gradientCb = null;
}
}
@@ -0,0 +1,30 @@
<?php
declare(strict_types=1);
namespace Phpml\Helper;
trait Predictable
{
/**
* @return mixed
*/
public function predict(array $samples)
{
if (!is_array($samples[0])) {
return $this->predictSample($samples);
}
$predicted = [];
foreach ($samples as $index => $sample) {
$predicted[$index] = $this->predictSample($sample);
}
return $predicted;
}
/**
* @return mixed
*/
abstract protected function predictSample(array $sample);
}
@@ -0,0 +1,24 @@
<?php
declare(strict_types=1);
namespace Phpml\Helper;
trait Trainable
{
/**
* @var array
*/
private $samples = [];
/**
* @var array
*/
private $targets = [];
public function train(array $samples, array $targets): void
{
$this->samples = array_merge($this->samples, $samples);
$this->targets = array_merge($this->targets, $targets);
}
}
@@ -0,0 +1,10 @@
<?php
declare(strict_types=1);
namespace Phpml;
interface IncrementalEstimator
{
public function partialTrain(array $samples, array $targets, array $labels = []): void;
}
@@ -0,0 +1,42 @@
<?php
declare(strict_types=1);
namespace Phpml\Math;
use Phpml\Exception\InvalidArgumentException;
class Comparison
{
/**
* @param mixed $a
* @param mixed $b
*
* @throws InvalidArgumentException
*/
public static function compare($a, $b, string $operator): bool
{
switch ($operator) {
case '>':
return $a > $b;
case '>=':
return $a >= $b;
case '=':
case '==':
return $a == $b;
case '===':
return $a === $b;
case '<=':
return $a <= $b;
case '<':
return $a < $b;
case '!=':
case '<>':
return $a != $b;
case '!==':
return $a !== $b;
default:
throw new InvalidArgumentException(sprintf('Invalid operator "%s" provided', $operator));
}
}
}
@@ -0,0 +1,10 @@
<?php
declare(strict_types=1);
namespace Phpml\Math;
interface Distance
{
public function distance(array $a, array $b): float;
}
@@ -0,0 +1,19 @@
<?php
declare(strict_types=1);
namespace Phpml\Math\Distance;
/**
* Class Chebyshev
*/
class Chebyshev extends Distance
{
/**
* {@inheritdoc}
*/
public function distance(array $a, array $b): float
{
return max($this->deltas($a, $b));
}
}
@@ -0,0 +1,61 @@
<?php
declare(strict_types=1);
namespace Phpml\Math\Distance;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Math\Distance as DistanceInterface;
/**
* Class Distance
*/
abstract class Distance implements DistanceInterface
{
/**
* @var float|int
*/
public $norm;
/**
* Distance constructor.
*/
public function __construct(float $norm = 3.0)
{
$this->norm = $norm;
}
/**
* @throws InvalidArgumentException
*/
public function distance(array $a, array $b): float
{
$distance = 0;
foreach ($this->deltas($a, $b) as $delta) {
$distance += $delta ** $this->norm;
}
return $distance ** (1 / $this->norm);
}
/**
* @throws InvalidArgumentException
*/
protected function deltas(array $a, array $b): array
{
$count = count($a);
if ($count !== count($b)) {
throw new InvalidArgumentException('Size of given arrays does not match');
}
$deltas = [];
for ($i = 0; $i < $count; $i++) {
$deltas[] = abs($a[$i] - $b[$i]);
}
return $deltas;
}
}
@@ -0,0 +1,31 @@
<?php
declare(strict_types=1);
namespace Phpml\Math\Distance;
/**
* Class Euclidean
*
* L^2 Metric.
*/
class Euclidean extends Distance
{
/**
* Euclidean constructor.
*/
public function __construct()
{
parent::__construct(2.0);
}
/**
* Square of Euclidean distance
*
* @throws \Phpml\Exception\InvalidArgumentException
*/
public function sqDistance(array $a, array $b): float
{
return $this->distance($a, $b) ** 2;
}
}
@@ -0,0 +1,21 @@
<?php
declare(strict_types=1);
namespace Phpml\Math\Distance;
/**
* Class Manhattan
*
* L^1 Metric.
*/
class Manhattan extends Distance
{
/**
* Manhattan constructor.
*/
public function __construct()
{
parent::__construct(1.0);
}
}
@@ -0,0 +1,14 @@
<?php
declare(strict_types=1);
namespace Phpml\Math\Distance;
/**
* Class Minkowski
*
* L^n Metric.
*/
class Minkowski extends Distance
{
}
@@ -0,0 +1,16 @@
<?php
declare(strict_types=1);
namespace Phpml\Math;
interface Kernel
{
/**
* @param float|array $a
* @param float|array $b
*
* @return float|array
*/
public function compute($a, $b);
}
@@ -0,0 +1,34 @@
<?php
declare(strict_types=1);
namespace Phpml\Math\Kernel;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Math\Kernel;
use Phpml\Math\Product;
class RBF implements Kernel
{
/**
* @var float
*/
private $gamma;
public function __construct(float $gamma)
{
$this->gamma = $gamma;
}
public function compute($a, $b): float
{
if (!is_array($a) || !is_array($b)) {
throw new InvalidArgumentException(sprintf('Arguments of %s must be arrays', __METHOD__));
}
$score = 2 * Product::scalar($a, $b);
$squares = Product::scalar($a, $a) + Product::scalar($b, $b);
return exp(-$this->gamma * ($squares - $score));
}
}
@@ -0,0 +1,960 @@
<?php
declare(strict_types=1);
/**
* Class to obtain eigenvalues and eigenvectors of a real matrix.
*
* If A is symmetric, then A = V*D*V' where the eigenvalue matrix D
* is diagonal and the eigenvector matrix V is orthogonal (i.e.
* A = V.times(D.times(V.transpose())) and V.times(V.transpose())
* equals the identity matrix).
*
* If A is not symmetric, then the eigenvalue matrix D is block diagonal
* with the real eigenvalues in 1-by-1 blocks and any complex eigenvalues,
* lambda + i*mu, in 2-by-2 blocks, [lambda, mu; -mu, lambda]. The
* columns of V represent the eigenvectors in the sense that A*V = V*D,
* i.e. A.times(V) equals V.times(D). The matrix V may be badly
* conditioned, or even singular, so the validity of the equation
* A = V*D*inverse(V) depends upon V.cond().
*
* @author Paul Meagher
* @license PHP v3.0
*
* @version 1.1
*
* Slightly changed to adapt the original code to PHP-ML library
* @date 2017/04/11
*
* @author Mustafa Karabulut
*/
namespace Phpml\Math\LinearAlgebra;
use Phpml\Math\Matrix;
class EigenvalueDecomposition
{
/**
* Row and column dimension (square matrix).
*
* @var int
*/
private $n;
/**
* Arrays for internal storage of eigenvalues.
*
* @var array
*/
private $d = [];
/**
* @var array
*/
private $e = [];
/**
* Array for internal storage of eigenvectors.
*
* @var array
*/
private $V = [];
/**
* Array for internal storage of nonsymmetric Hessenberg form.
*
* @var array
*/
private $H = [];
/**
* Working storage for nonsymmetric algorithm.
*
* @var array
*/
private $ort = [];
/**
* Used for complex scalar division.
*
* @var float
*/
private $cdivr;
/**
* @var float
*/
private $cdivi;
/**
* Constructor: Check for symmetry, then construct the eigenvalue decomposition
*/
public function __construct(array $arg)
{
$this->n = count($arg[0]);
$symmetric = true;
for ($j = 0; ($j < $this->n) & $symmetric; ++$j) {
for ($i = 0; ($i < $this->n) & $symmetric; ++$i) {
$symmetric = $arg[$i][$j] == $arg[$j][$i];
}
}
if ($symmetric) {
$this->V = $arg;
// Tridiagonalize.
$this->tred2();
// Diagonalize.
$this->tql2();
} else {
$this->H = $arg;
$this->ort = [];
// Reduce to Hessenberg form.
$this->orthes();
// Reduce Hessenberg to real Schur form.
$this->hqr2();
}
}
/**
* Return the eigenvector matrix
*/
public function getEigenvectors(): array
{
$vectors = $this->V;
// Always return the eigenvectors of length 1.0
$vectors = new Matrix($vectors);
$vectors = array_map(function ($vect) {
$sum = 0;
$count = count($vect);
for ($i = 0; $i < $count; ++$i) {
$sum += $vect[$i] ** 2;
}
$sum **= .5;
for ($i = 0; $i < $count; ++$i) {
$vect[$i] /= $sum;
}
return $vect;
}, $vectors->transpose()->toArray());
return $vectors;
}
/**
* Return the real parts of the eigenvalues<br>
* d = real(diag(D));
*/
public function getRealEigenvalues(): array
{
return $this->d;
}
/**
* Return the imaginary parts of the eigenvalues <br>
* d = imag(diag(D))
*/
public function getImagEigenvalues(): array
{
return $this->e;
}
/**
* Return the block diagonal eigenvalue matrix
*/
public function getDiagonalEigenvalues(): array
{
$D = [];
for ($i = 0; $i < $this->n; ++$i) {
$D[$i] = array_fill(0, $this->n, 0.0);
$D[$i][$i] = $this->d[$i];
if ($this->e[$i] == 0) {
continue;
}
$o = $this->e[$i] > 0 ? $i + 1 : $i - 1;
$D[$i][$o] = $this->e[$i];
}
return $D;
}
/**
* Symmetric Householder reduction to tridiagonal form.
*/
private function tred2(): void
{
// This is derived from the Algol procedures tred2 by
// Bowdler, Martin, Reinsch, and Wilkinson, Handbook for
// Auto. Comp., Vol.ii-Linear Algebra, and the corresponding
// Fortran subroutine in EISPACK.
$this->d = $this->V[$this->n - 1];
// Householder reduction to tridiagonal form.
for ($i = $this->n - 1; $i > 0; --$i) {
$i_ = $i - 1;
// Scale to avoid under/overflow.
$h = $scale = 0.0;
$scale += array_sum(array_map('abs', $this->d));
if ($scale == 0.0) {
$this->e[$i] = $this->d[$i_];
$this->d = array_slice($this->V[$i_], 0, $this->n - 1);
for ($j = 0; $j < $i; ++$j) {
$this->V[$j][$i] = $this->V[$i][$j] = 0.0;
}
} else {
// Generate Householder vector.
for ($k = 0; $k < $i; ++$k) {
$this->d[$k] /= $scale;
$h += $this->d[$k] ** 2;
}
$f = $this->d[$i_];
$g = $h ** .5;
if ($f > 0) {
$g = -$g;
}
$this->e[$i] = $scale * $g;
$h -= $f * $g;
$this->d[$i_] = $f - $g;
for ($j = 0; $j < $i; ++$j) {
$this->e[$j] = 0.0;
}
// Apply similarity transformation to remaining columns.
for ($j = 0; $j < $i; ++$j) {
$f = $this->d[$j];
$this->V[$j][$i] = $f;
$g = $this->e[$j] + $this->V[$j][$j] * $f;
for ($k = $j + 1; $k <= $i_; ++$k) {
$g += $this->V[$k][$j] * $this->d[$k];
$this->e[$k] += $this->V[$k][$j] * $f;
}
$this->e[$j] = $g;
}
$f = 0.0;
if ($h == 0.0) {
$h = 1e-32;
}
for ($j = 0; $j < $i; ++$j) {
$this->e[$j] /= $h;
$f += $this->e[$j] * $this->d[$j];
}
$hh = $f / (2 * $h);
for ($j = 0; $j < $i; ++$j) {
$this->e[$j] -= $hh * $this->d[$j];
}
for ($j = 0; $j < $i; ++$j) {
$f = $this->d[$j];
$g = $this->e[$j];
for ($k = $j; $k <= $i_; ++$k) {
$this->V[$k][$j] -= ($f * $this->e[$k] + $g * $this->d[$k]);
}
$this->d[$j] = $this->V[$i - 1][$j];
$this->V[$i][$j] = 0.0;
}
}
$this->d[$i] = $h;
}
// Accumulate transformations.
for ($i = 0; $i < $this->n - 1; ++$i) {
$this->V[$this->n - 1][$i] = $this->V[$i][$i];
$this->V[$i][$i] = 1.0;
$h = $this->d[$i + 1];
if ($h != 0.0) {
for ($k = 0; $k <= $i; ++$k) {
$this->d[$k] = $this->V[$k][$i + 1] / $h;
}
for ($j = 0; $j <= $i; ++$j) {
$g = 0.0;
for ($k = 0; $k <= $i; ++$k) {
$g += $this->V[$k][$i + 1] * $this->V[$k][$j];
}
for ($k = 0; $k <= $i; ++$k) {
$this->V[$k][$j] -= $g * $this->d[$k];
}
}
}
for ($k = 0; $k <= $i; ++$k) {
$this->V[$k][$i + 1] = 0.0;
}
}
$this->d = $this->V[$this->n - 1];
$this->V[$this->n - 1] = array_fill(0, $this->n, 0.0);
$this->V[$this->n - 1][$this->n - 1] = 1.0;
$this->e[0] = 0.0;
}
/**
* Symmetric tridiagonal QL algorithm.
*
* This is derived from the Algol procedures tql2, by
* Bowdler, Martin, Reinsch, and Wilkinson, Handbook for
* Auto. Comp., Vol.ii-Linear Algebra, and the corresponding
* Fortran subroutine in EISPACK.
*/
private function tql2(): void
{
for ($i = 1; $i < $this->n; ++$i) {
$this->e[$i - 1] = $this->e[$i];
}
$this->e[$this->n - 1] = 0.0;
$f = 0.0;
$tst1 = 0.0;
$eps = 2.0 ** -52.0;
for ($l = 0; $l < $this->n; ++$l) {
// Find small subdiagonal element
$tst1 = max($tst1, abs($this->d[$l]) + abs($this->e[$l]));
$m = $l;
while ($m < $this->n) {
if (abs($this->e[$m]) <= $eps * $tst1) {
break;
}
++$m;
}
// If m == l, $this->d[l] is an eigenvalue,
// otherwise, iterate.
if ($m > $l) {
do {
// Compute implicit shift
$g = $this->d[$l];
$p = ($this->d[$l + 1] - $g) / (2.0 * $this->e[$l]);
$r = hypot($p, 1.0);
if ($p < 0) {
$r *= -1;
}
$this->d[$l] = $this->e[$l] / ($p + $r);
$this->d[$l + 1] = $this->e[$l] * ($p + $r);
$dl1 = $this->d[$l + 1];
$h = $g - $this->d[$l];
for ($i = $l + 2; $i < $this->n; ++$i) {
$this->d[$i] -= $h;
}
$f += $h;
// Implicit QL transformation.
$p = $this->d[$m];
$c = 1.0;
$c2 = $c3 = $c;
$el1 = $this->e[$l + 1];
$s = $s2 = 0.0;
for ($i = $m - 1; $i >= $l; --$i) {
$c3 = $c2;
$c2 = $c;
$s2 = $s;
$g = $c * $this->e[$i];
$h = $c * $p;
$r = hypot($p, $this->e[$i]);
$this->e[$i + 1] = $s * $r;
$s = $this->e[$i] / $r;
$c = $p / $r;
$p = $c * $this->d[$i] - $s * $g;
$this->d[$i + 1] = $h + $s * ($c * $g + $s * $this->d[$i]);
// Accumulate transformation.
for ($k = 0; $k < $this->n; ++$k) {
$h = $this->V[$k][$i + 1];
$this->V[$k][$i + 1] = $s * $this->V[$k][$i] + $c * $h;
$this->V[$k][$i] = $c * $this->V[$k][$i] - $s * $h;
}
}
$p = -$s * $s2 * $c3 * $el1 * $this->e[$l] / $dl1;
$this->e[$l] = $s * $p;
$this->d[$l] = $c * $p;
// Check for convergence.
} while (abs($this->e[$l]) > $eps * $tst1);
}
$this->d[$l] += $f;
$this->e[$l] = 0.0;
}
// Sort eigenvalues and corresponding vectors.
for ($i = 0; $i < $this->n - 1; ++$i) {
$k = $i;
$p = $this->d[$i];
for ($j = $i + 1; $j < $this->n; ++$j) {
if ($this->d[$j] < $p) {
$k = $j;
$p = $this->d[$j];
}
}
if ($k != $i) {
$this->d[$k] = $this->d[$i];
$this->d[$i] = $p;
for ($j = 0; $j < $this->n; ++$j) {
$p = $this->V[$j][$i];
$this->V[$j][$i] = $this->V[$j][$k];
$this->V[$j][$k] = $p;
}
}
}
}
/**
* Nonsymmetric reduction to Hessenberg form.
*
* This is derived from the Algol procedures orthes and ortran,
* by Martin and Wilkinson, Handbook for Auto. Comp.,
* Vol.ii-Linear Algebra, and the corresponding
* Fortran subroutines in EISPACK.
*/
private function orthes(): void
{
$low = 0;
$high = $this->n - 1;
for ($m = $low + 1; $m <= $high - 1; ++$m) {
// Scale column.
$scale = 0.0;
for ($i = $m; $i <= $high; ++$i) {
$scale += abs($this->H[$i][$m - 1]);
}
if ($scale != 0.0) {
// Compute Householder transformation.
$h = 0.0;
for ($i = $high; $i >= $m; --$i) {
$this->ort[$i] = $this->H[$i][$m - 1] / $scale;
$h += $this->ort[$i] * $this->ort[$i];
}
$g = $h ** .5;
if ($this->ort[$m] > 0) {
$g *= -1;
}
$h -= $this->ort[$m] * $g;
$this->ort[$m] -= $g;
// Apply Householder similarity transformation
// H = (I -u * u' / h) * H * (I -u * u') / h)
for ($j = $m; $j < $this->n; ++$j) {
$f = 0.0;
for ($i = $high; $i >= $m; --$i) {
$f += $this->ort[$i] * $this->H[$i][$j];
}
$f /= $h;
for ($i = $m; $i <= $high; ++$i) {
$this->H[$i][$j] -= $f * $this->ort[$i];
}
}
for ($i = 0; $i <= $high; ++$i) {
$f = 0.0;
for ($j = $high; $j >= $m; --$j) {
$f += $this->ort[$j] * $this->H[$i][$j];
}
$f /= $h;
for ($j = $m; $j <= $high; ++$j) {
$this->H[$i][$j] -= $f * $this->ort[$j];
}
}
$this->ort[$m] = $scale * $this->ort[$m];
$this->H[$m][$m - 1] = $scale * $g;
}
}
// Accumulate transformations (Algol's ortran).
for ($i = 0; $i < $this->n; ++$i) {
for ($j = 0; $j < $this->n; ++$j) {
$this->V[$i][$j] = ($i == $j ? 1.0 : 0.0);
}
}
for ($m = $high - 1; $m >= $low + 1; --$m) {
if ($this->H[$m][$m - 1] != 0.0) {
for ($i = $m + 1; $i <= $high; ++$i) {
$this->ort[$i] = $this->H[$i][$m - 1];
}
for ($j = $m; $j <= $high; ++$j) {
$g = 0.0;
for ($i = $m; $i <= $high; ++$i) {
$g += $this->ort[$i] * $this->V[$i][$j];
}
// Double division avoids possible underflow
$g /= $this->ort[$m];
$g /= $this->H[$m][$m - 1];
for ($i = $m; $i <= $high; ++$i) {
$this->V[$i][$j] += $g * $this->ort[$i];
}
}
}
}
}
/**
* Performs complex division.
*
* @param int|float $xr
* @param int|float $xi
* @param int|float $yr
* @param int|float $yi
*/
private function cdiv($xr, $xi, $yr, $yi): void
{
if (abs($yr) > abs($yi)) {
$r = $yi / $yr;
$d = $yr + $r * $yi;
$this->cdivr = ($xr + $r * $xi) / $d;
$this->cdivi = ($xi - $r * $xr) / $d;
} else {
$r = $yr / $yi;
$d = $yi + $r * $yr;
$this->cdivr = ($r * $xr + $xi) / $d;
$this->cdivi = ($r * $xi - $xr) / $d;
}
}
/**
* Nonsymmetric reduction from Hessenberg to real Schur form.
*
* Code is derived from the Algol procedure hqr2,
* by Martin and Wilkinson, Handbook for Auto. Comp.,
* Vol.ii-Linear Algebra, and the corresponding
* Fortran subroutine in EISPACK.
*/
private function hqr2(): void
{
// Initialize
$nn = $this->n;
$n = $nn - 1;
$low = 0;
$high = $nn - 1;
$eps = 2.0 ** -52.0;
$exshift = 0.0;
$p = $q = $r = $s = $z = 0;
// Store roots isolated by balanc and compute matrix norm
$norm = 0.0;
for ($i = 0; $i < $nn; ++$i) {
if (($i < $low) or ($i > $high)) {
$this->d[$i] = $this->H[$i][$i];
$this->e[$i] = 0.0;
}
for ($j = max($i - 1, 0); $j < $nn; ++$j) {
$norm += abs($this->H[$i][$j]);
}
}
// Outer loop over eigenvalue index
$iter = 0;
while ($n >= $low) {
// Look for single small sub-diagonal element
$l = $n;
while ($l > $low) {
$s = abs($this->H[$l - 1][$l - 1]) + abs($this->H[$l][$l]);
if ($s == 0.0) {
$s = $norm;
}
if (abs($this->H[$l][$l - 1]) < $eps * $s) {
break;
}
--$l;
}
// Check for convergence
// One root found
if ($l == $n) {
$this->H[$n][$n] += $exshift;
$this->d[$n] = $this->H[$n][$n];
$this->e[$n] = 0.0;
--$n;
$iter = 0;
// Two roots found
} elseif ($l == $n - 1) {
$w = $this->H[$n][$n - 1] * $this->H[$n - 1][$n];
$p = ($this->H[$n - 1][$n - 1] - $this->H[$n][$n]) / 2.0;
$q = $p * $p + $w;
$z = abs($q) ** .5;
$this->H[$n][$n] += $exshift;
$this->H[$n - 1][$n - 1] += $exshift;
$x = $this->H[$n][$n];
// Real pair
if ($q >= 0) {
if ($p >= 0) {
$z = $p + $z;
} else {
$z = $p - $z;
}
$this->d[$n - 1] = $x + $z;
$this->d[$n] = $this->d[$n - 1];
if ($z != 0.0) {
$this->d[$n] = $x - $w / $z;
}
$this->e[$n - 1] = 0.0;
$this->e[$n] = 0.0;
$x = $this->H[$n][$n - 1];
$s = abs($x) + abs($z);
$p = $x / $s;
$q = $z / $s;
$r = ($p * $p + $q * $q) ** .5;
$p /= $r;
$q /= $r;
// Row modification
for ($j = $n - 1; $j < $nn; ++$j) {
$z = $this->H[$n - 1][$j];
$this->H[$n - 1][$j] = $q * $z + $p * $this->H[$n][$j];
$this->H[$n][$j] = $q * $this->H[$n][$j] - $p * $z;
}
// Column modification
for ($i = 0; $i <= $n; ++$i) {
$z = $this->H[$i][$n - 1];
$this->H[$i][$n - 1] = $q * $z + $p * $this->H[$i][$n];
$this->H[$i][$n] = $q * $this->H[$i][$n] - $p * $z;
}
// Accumulate transformations
for ($i = $low; $i <= $high; ++$i) {
$z = $this->V[$i][$n - 1];
$this->V[$i][$n - 1] = $q * $z + $p * $this->V[$i][$n];
$this->V[$i][$n] = $q * $this->V[$i][$n] - $p * $z;
}
// Complex pair
} else {
$this->d[$n - 1] = $x + $p;
$this->d[$n] = $x + $p;
$this->e[$n - 1] = $z;
$this->e[$n] = -$z;
}
$n -= 2;
$iter = 0;
// No convergence yet
} else {
// Form shift
$x = $this->H[$n][$n];
$y = 0.0;
$w = 0.0;
if ($l < $n) {
$y = $this->H[$n - 1][$n - 1];
$w = $this->H[$n][$n - 1] * $this->H[$n - 1][$n];
}
// Wilkinson's original ad hoc shift
if ($iter == 10) {
$exshift += $x;
for ($i = $low; $i <= $n; ++$i) {
$this->H[$i][$i] -= $x;
}
$s = abs($this->H[$n][$n - 1]) + abs($this->H[$n - 1][$n - 2]);
$x = $y = 0.75 * $s;
$w = -0.4375 * $s * $s;
}
// MATLAB's new ad hoc shift
if ($iter == 30) {
$s = ($y - $x) / 2.0;
$s *= $s + $w;
if ($s > 0) {
$s **= .5;
if ($y < $x) {
$s = -$s;
}
$s = $x - $w / (($y - $x) / 2.0 + $s);
for ($i = $low; $i <= $n; ++$i) {
$this->H[$i][$i] -= $s;
}
$exshift += $s;
$x = $y = $w = 0.964;
}
}
// Could check iteration count here.
++$iter;
// Look for two consecutive small sub-diagonal elements
$m = $n - 2;
while ($m >= $l) {
$z = $this->H[$m][$m];
$r = $x - $z;
$s = $y - $z;
$p = ($r * $s - $w) / $this->H[$m + 1][$m] + $this->H[$m][$m + 1];
$q = $this->H[$m + 1][$m + 1] - $z - $r - $s;
$r = $this->H[$m + 2][$m + 1];
$s = abs($p) + abs($q) + abs($r);
$p /= $s;
$q /= $s;
$r /= $s;
if ($m == $l) {
break;
}
if (abs($this->H[$m][$m - 1]) * (abs($q) + abs($r)) <
$eps * (abs($p) * (abs($this->H[$m - 1][$m - 1]) + abs($z) + abs($this->H[$m + 1][$m + 1])))) {
break;
}
--$m;
}
for ($i = $m + 2; $i <= $n; ++$i) {
$this->H[$i][$i - 2] = 0.0;
if ($i > $m + 2) {
$this->H[$i][$i - 3] = 0.0;
}
}
// Double QR step involving rows l:n and columns m:n
for ($k = $m; $k <= $n - 1; ++$k) {
$notlast = $k != $n - 1;
if ($k != $m) {
$p = $this->H[$k][$k - 1];
$q = $this->H[$k + 1][$k - 1];
$r = ($notlast ? $this->H[$k + 2][$k - 1] : 0.0);
$x = abs($p) + abs($q) + abs($r);
if ($x != 0.0) {
$p /= $x;
$q /= $x;
$r /= $x;
}
}
if ($x == 0.0) {
break;
}
$s = ($p * $p + $q * $q + $r * $r) ** .5;
if ($p < 0) {
$s = -$s;
}
if ($s != 0) {
if ($k != $m) {
$this->H[$k][$k - 1] = -$s * $x;
} elseif ($l != $m) {
$this->H[$k][$k - 1] = -$this->H[$k][$k - 1];
}
$p += $s;
$x = $p / $s;
$y = $q / $s;
$z = $r / $s;
$q /= $p;
$r /= $p;
// Row modification
for ($j = $k; $j < $nn; ++$j) {
$p = $this->H[$k][$j] + $q * $this->H[$k + 1][$j];
if ($notlast) {
$p += $r * $this->H[$k + 2][$j];
$this->H[$k + 2][$j] -= $p * $z;
}
$this->H[$k][$j] -= $p * $x;
$this->H[$k + 1][$j] -= $p * $y;
}
// Column modification
for ($i = 0; $i <= min($n, $k + 3); ++$i) {
$p = $x * $this->H[$i][$k] + $y * $this->H[$i][$k + 1];
if ($notlast) {
$p += $z * $this->H[$i][$k + 2];
$this->H[$i][$k + 2] -= $p * $r;
}
$this->H[$i][$k] -= $p;
$this->H[$i][$k + 1] -= $p * $q;
}
// Accumulate transformations
for ($i = $low; $i <= $high; ++$i) {
$p = $x * $this->V[$i][$k] + $y * $this->V[$i][$k + 1];
if ($notlast) {
$p += $z * $this->V[$i][$k + 2];
$this->V[$i][$k + 2] -= $p * $r;
}
$this->V[$i][$k] -= $p;
$this->V[$i][$k + 1] -= $p * $q;
}
} // ($s != 0)
} // k loop
} // check convergence
} // while ($n >= $low)
// Backsubstitute to find vectors of upper triangular form
if ($norm == 0.0) {
return;
}
for ($n = $nn - 1; $n >= 0; --$n) {
$p = $this->d[$n];
$q = $this->e[$n];
// Real vector
if ($q == 0) {
$l = $n;
$this->H[$n][$n] = 1.0;
for ($i = $n - 1; $i >= 0; --$i) {
$w = $this->H[$i][$i] - $p;
$r = 0.0;
for ($j = $l; $j <= $n; ++$j) {
$r += $this->H[$i][$j] * $this->H[$j][$n];
}
if ($this->e[$i] < 0.0) {
$z = $w;
$s = $r;
} else {
$l = $i;
if ($this->e[$i] == 0.0) {
if ($w != 0.0) {
$this->H[$i][$n] = -$r / $w;
} else {
$this->H[$i][$n] = -$r / ($eps * $norm);
}
// Solve real equations
} else {
$x = $this->H[$i][$i + 1];
$y = $this->H[$i + 1][$i];
$q = ($this->d[$i] - $p) * ($this->d[$i] - $p) + $this->e[$i] * $this->e[$i];
$t = ($x * $s - $z * $r) / $q;
$this->H[$i][$n] = $t;
if (abs($x) > abs($z)) {
$this->H[$i + 1][$n] = (-$r - $w * $t) / $x;
} else {
$this->H[$i + 1][$n] = (-$s - $y * $t) / $z;
}
}
// Overflow control
$t = abs($this->H[$i][$n]);
if (($eps * $t) * $t > 1) {
for ($j = $i; $j <= $n; ++$j) {
$this->H[$j][$n] /= $t;
}
}
}
}
// Complex vector
} elseif ($q < 0) {
$l = $n - 1;
// Last vector component imaginary so matrix is triangular
if (abs($this->H[$n][$n - 1]) > abs($this->H[$n - 1][$n])) {
$this->H[$n - 1][$n - 1] = $q / $this->H[$n][$n - 1];
$this->H[$n - 1][$n] = -($this->H[$n][$n] - $p) / $this->H[$n][$n - 1];
} else {
$this->cdiv(0.0, -$this->H[$n - 1][$n], $this->H[$n - 1][$n - 1] - $p, $q);
$this->H[$n - 1][$n - 1] = $this->cdivr;
$this->H[$n - 1][$n] = $this->cdivi;
}
$this->H[$n][$n - 1] = 0.0;
$this->H[$n][$n] = 1.0;
for ($i = $n - 2; $i >= 0; --$i) {
// double ra,sa,vr,vi;
$ra = 0.0;
$sa = 0.0;
for ($j = $l; $j <= $n; ++$j) {
$ra += $this->H[$i][$j] * $this->H[$j][$n - 1];
$sa += $this->H[$i][$j] * $this->H[$j][$n];
}
$w = $this->H[$i][$i] - $p;
if ($this->e[$i] < 0.0) {
$z = $w;
$r = $ra;
$s = $sa;
} else {
$l = $i;
if ($this->e[$i] == 0) {
$this->cdiv(-$ra, -$sa, $w, $q);
$this->H[$i][$n - 1] = $this->cdivr;
$this->H[$i][$n] = $this->cdivi;
} else {
// Solve complex equations
$x = $this->H[$i][$i + 1];
$y = $this->H[$i + 1][$i];
$vr = ($this->d[$i] - $p) * ($this->d[$i] - $p) + $this->e[$i] * $this->e[$i] - $q * $q;
$vi = ($this->d[$i] - $p) * 2.0 * $q;
if ($vr == 0.0 && $vi == 0.0) {
$vr = $eps * $norm * (abs($w) + abs($q) + abs($x) + abs($y) + abs($z));
}
$this->cdiv($x * $r - $z * $ra + $q * $sa, $x * $s - $z * $sa - $q * $ra, $vr, $vi);
$this->H[$i][$n - 1] = $this->cdivr;
$this->H[$i][$n] = $this->cdivi;
if (abs($x) > (abs($z) + abs($q))) {
$this->H[$i + 1][$n - 1] = (-$ra - $w * $this->H[$i][$n - 1] + $q * $this->H[$i][$n]) / $x;
$this->H[$i + 1][$n] = (-$sa - $w * $this->H[$i][$n] - $q * $this->H[$i][$n - 1]) / $x;
} else {
$this->cdiv(-$r - $y * $this->H[$i][$n - 1], -$s - $y * $this->H[$i][$n], $z, $q);
$this->H[$i + 1][$n - 1] = $this->cdivr;
$this->H[$i + 1][$n] = $this->cdivi;
}
}
// Overflow control
$t = max(abs($this->H[$i][$n - 1]), abs($this->H[$i][$n]));
if (($eps * $t) * $t > 1) {
for ($j = $i; $j <= $n; ++$j) {
$this->H[$j][$n - 1] /= $t;
$this->H[$j][$n] /= $t;
}
}
} // end else
} // end for
} // end else for complex case
} // end for
// Vectors of isolated roots
for ($i = 0; $i < $nn; ++$i) {
if ($i < $low || $i > $high) {
for ($j = $i; $j < $nn; ++$j) {
$this->V[$i][$j] = $this->H[$i][$j];
}
}
}
// Back transformation to get eigenvectors of original matrix
for ($j = $nn - 1; $j >= $low; --$j) {
for ($i = $low; $i <= $high; ++$i) {
$z = 0.0;
for ($k = $low; $k <= min($j, $high); ++$k) {
$z += $this->V[$i][$k] * $this->H[$k][$j];
}
$this->V[$i][$j] = $z;
}
}
}
}
@@ -0,0 +1,299 @@
<?php
declare(strict_types=1);
/**
* @package JAMA
*
* For an m-by-n matrix A with m >= n, the LU decomposition is an m-by-n
* unit lower triangular matrix L, an n-by-n upper triangular matrix U,
* and a permutation vector piv of length m so that A(piv,:) = L*U.
* If m < n, then L is m-by-m and U is m-by-n.
*
* The LU decompostion with pivoting always exists, even if the matrix is
* singular, so the constructor will never fail. The primary use of the
* LU decomposition is in the solution of square systems of simultaneous
* linear equations. This will fail if isNonsingular() returns false.
*
* @author Paul Meagher
* @author Bartosz Matosiuk
* @author Michael Bommarito
*
* @version 1.1
*
* @license PHP v3.0
*
* Slightly changed to adapt the original code to PHP-ML library
* @date 2017/04/24
*
* @author Mustafa Karabulut
*/
namespace Phpml\Math\LinearAlgebra;
use Phpml\Exception\MatrixException;
use Phpml\Math\Matrix;
class LUDecomposition
{
/**
* Decomposition storage
*
* @var array
*/
private $LU = [];
/**
* Row dimension.
*
* @var int
*/
private $m;
/**
* Column dimension.
*
* @var int
*/
private $n;
/**
* Pivot sign.
*
* @var int
*/
private $pivsign;
/**
* Internal storage of pivot vector.
*
* @var array
*/
private $piv = [];
/**
* Constructs Structure to access L, U and piv.
*
* @param Matrix $A Rectangular matrix
*
* @throws MatrixException
*/
public function __construct(Matrix $A)
{
if ($A->getRows() !== $A->getColumns()) {
throw new MatrixException('Matrix is not square matrix');
}
// Use a "left-looking", dot-product, Crout/Doolittle algorithm.
$this->LU = $A->toArray();
$this->m = $A->getRows();
$this->n = $A->getColumns();
for ($i = 0; $i < $this->m; ++$i) {
$this->piv[$i] = $i;
}
$this->pivsign = 1;
$LUcolj = [];
// Outer loop.
for ($j = 0; $j < $this->n; ++$j) {
// Make a copy of the j-th column to localize references.
for ($i = 0; $i < $this->m; ++$i) {
$LUcolj[$i] = &$this->LU[$i][$j];
}
// Apply previous transformations.
for ($i = 0; $i < $this->m; ++$i) {
$LUrowi = $this->LU[$i];
// Most of the time is spent in the following dot product.
$kmax = min($i, $j);
$s = 0.0;
for ($k = 0; $k < $kmax; ++$k) {
$s += $LUrowi[$k] * $LUcolj[$k];
}
$LUrowi[$j] = $LUcolj[$i] -= $s;
}
// Find pivot and exchange if necessary.
$p = $j;
for ($i = $j + 1; $i < $this->m; ++$i) {
if (abs($LUcolj[$i] ?? 0) > abs($LUcolj[$p] ?? 0)) {
$p = $i;
}
}
if ($p != $j) {
for ($k = 0; $k < $this->n; ++$k) {
$t = $this->LU[$p][$k];
$this->LU[$p][$k] = $this->LU[$j][$k];
$this->LU[$j][$k] = $t;
}
$k = $this->piv[$p];
$this->piv[$p] = $this->piv[$j];
$this->piv[$j] = $k;
$this->pivsign *= -1;
}
// Compute multipliers.
if (($j < $this->m) && ($this->LU[$j][$j] != 0.0)) {
for ($i = $j + 1; $i < $this->m; ++$i) {
$this->LU[$i][$j] /= $this->LU[$j][$j];
}
}
}
}
/**
* Get lower triangular factor.
*
* @return Matrix Lower triangular factor
*/
public function getL(): Matrix
{
$L = [];
for ($i = 0; $i < $this->m; ++$i) {
for ($j = 0; $j < $this->n; ++$j) {
if ($i > $j) {
$L[$i][$j] = $this->LU[$i][$j];
} elseif ($i == $j) {
$L[$i][$j] = 1.0;
} else {
$L[$i][$j] = 0.0;
}
}
}
return new Matrix($L);
}
/**
* Get upper triangular factor.
*
* @return Matrix Upper triangular factor
*/
public function getU(): Matrix
{
$U = [];
for ($i = 0; $i < $this->n; ++$i) {
for ($j = 0; $j < $this->n; ++$j) {
if ($i <= $j) {
$U[$i][$j] = $this->LU[$i][$j];
} else {
$U[$i][$j] = 0.0;
}
}
}
return new Matrix($U);
}
/**
* Return pivot permutation vector.
*
* @return array Pivot vector
*/
public function getPivot(): array
{
return $this->piv;
}
/**
* Alias for getPivot
*
* @see getPivot
*/
public function getDoublePivot(): array
{
return $this->getPivot();
}
/**
* Is the matrix nonsingular?
*
* @return bool true if U, and hence A, is nonsingular.
*/
public function isNonsingular(): bool
{
for ($j = 0; $j < $this->n; ++$j) {
if ($this->LU[$j][$j] == 0) {
return false;
}
}
return true;
}
public function det(): float
{
$d = $this->pivsign;
for ($j = 0; $j < $this->n; ++$j) {
$d *= $this->LU[$j][$j];
}
return (float) $d;
}
/**
* Solve A*X = B
*
* @param Matrix $B A Matrix with as many rows as A and any number of columns.
*
* @return array X so that L*U*X = B(piv,:)
*
* @throws MatrixException
*/
public function solve(Matrix $B): array
{
if ($B->getRows() != $this->m) {
throw new MatrixException('Matrix is not square matrix');
}
if (!$this->isNonsingular()) {
throw new MatrixException('Matrix is singular');
}
// Copy right hand side with pivoting
$nx = $B->getColumns();
$X = $this->getSubMatrix($B->toArray(), $this->piv, 0, $nx - 1);
// Solve L*Y = B(piv,:)
for ($k = 0; $k < $this->n; ++$k) {
for ($i = $k + 1; $i < $this->n; ++$i) {
for ($j = 0; $j < $nx; ++$j) {
$X[$i][$j] -= $X[$k][$j] * $this->LU[$i][$k];
}
}
}
// Solve U*X = Y;
for ($k = $this->n - 1; $k >= 0; --$k) {
for ($j = 0; $j < $nx; ++$j) {
$X[$k][$j] /= $this->LU[$k][$k];
}
for ($i = 0; $i < $k; ++$i) {
for ($j = 0; $j < $nx; ++$j) {
$X[$i][$j] -= $X[$k][$j] * $this->LU[$i][$k];
}
}
}
return $X;
}
protected function getSubMatrix(array $matrix, array $RL, int $j0, int $jF): array
{
$m = count($RL);
$n = $jF - $j0;
$R = array_fill(0, $m, array_fill(0, $n + 1, 0.0));
for ($i = 0; $i < $m; ++$i) {
for ($j = $j0; $j <= $jF; ++$j) {
$R[$i][$j - $j0] = $matrix[$RL[$i]][$j];
}
}
return $R;
}
}
@@ -0,0 +1,327 @@
<?php
declare(strict_types=1);
namespace Phpml\Math;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Exception\MatrixException;
use Phpml\Math\LinearAlgebra\LUDecomposition;
class Matrix
{
/**
* @var array
*/
private $matrix = [];
/**
* @var int
*/
private $rows;
/**
* @var int
*/
private $columns;
/**
* @var float
*/
private $determinant;
/**
* @throws InvalidArgumentException
*/
public function __construct(array $matrix, bool $validate = true)
{
// When a row vector is given
if (!is_array($matrix[0])) {
$this->rows = 1;
$this->columns = count($matrix);
$matrix = [$matrix];
} else {
$this->rows = count($matrix);
$this->columns = count($matrix[0]);
}
if ($validate) {
for ($i = 0; $i < $this->rows; ++$i) {
if (count($matrix[$i]) !== $this->columns) {
throw new InvalidArgumentException('Matrix dimensions did not match');
}
}
}
$this->matrix = $matrix;
}
public static function fromFlatArray(array $array): self
{
$matrix = [];
foreach ($array as $value) {
$matrix[] = [$value];
}
return new self($matrix);
}
public function toArray(): array
{
return $this->matrix;
}
public function toScalar(): float
{
return $this->matrix[0][0];
}
public function getRows(): int
{
return $this->rows;
}
public function getColumns(): int
{
return $this->columns;
}
/**
* @throws MatrixException
*/
public function getColumnValues(int $column): array
{
if ($column >= $this->columns) {
throw new MatrixException('Column out of range');
}
return array_column($this->matrix, $column);
}
/**
* @return float|int
*
* @throws MatrixException
*/
public function getDeterminant()
{
if ($this->determinant !== null) {
return $this->determinant;
}
if (!$this->isSquare()) {
throw new MatrixException('Matrix is not square matrix');
}
$lu = new LUDecomposition($this);
return $this->determinant = $lu->det();
}
public function isSquare(): bool
{
return $this->columns === $this->rows;
}
public function transpose(): self
{
if ($this->rows === 1) {
$matrix = array_map(static function ($el): array {
return [$el];
}, $this->matrix[0]);
} else {
$matrix = array_map(null, ...$this->matrix);
}
return new self($matrix, false);
}
public function multiply(self $matrix): self
{
if ($this->columns !== $matrix->getRows()) {
throw new InvalidArgumentException('Inconsistent matrix supplied');
}
$array1 = $this->toArray();
$array2 = $matrix->toArray();
$colCount = $matrix->columns;
/*
- To speed-up multiplication, we need to avoid use of array index operator [ ] as much as possible( See #255 for details)
- A combination of "foreach" and "array_column" works much faster then accessing the array via index operator
*/
$product = [];
foreach ($array1 as $row => $rowData) {
for ($col = 0; $col < $colCount; ++$col) {
$columnData = array_column($array2, $col);
$sum = 0;
foreach ($rowData as $key => $valueData) {
$sum += $valueData * $columnData[$key];
}
$product[$row][$col] = $sum;
}
}
return new self($product, false);
}
/**
* @param float|int $value
*/
public function divideByScalar($value): self
{
$newMatrix = [];
for ($i = 0; $i < $this->rows; ++$i) {
for ($j = 0; $j < $this->columns; ++$j) {
$newMatrix[$i][$j] = $this->matrix[$i][$j] / $value;
}
}
return new self($newMatrix, false);
}
/**
* @param float|int $value
*/
public function multiplyByScalar($value): self
{
$newMatrix = [];
for ($i = 0; $i < $this->rows; ++$i) {
for ($j = 0; $j < $this->columns; ++$j) {
$newMatrix[$i][$j] = $this->matrix[$i][$j] * $value;
}
}
return new self($newMatrix, false);
}
/**
* Element-wise addition of the matrix with another one
*/
public function add(self $other): self
{
return $this->sum($other);
}
/**
* Element-wise subtracting of another matrix from this one
*/
public function subtract(self $other): self
{
return $this->sum($other, -1);
}
public function inverse(): self
{
if (!$this->isSquare()) {
throw new MatrixException('Matrix is not square matrix');
}
$LU = new LUDecomposition($this);
$identity = $this->getIdentity();
$inverse = $LU->solve($identity);
return new self($inverse, false);
}
public function crossOut(int $row, int $column): self
{
$newMatrix = [];
$r = 0;
for ($i = 0; $i < $this->rows; ++$i) {
$c = 0;
if ($row != $i) {
for ($j = 0; $j < $this->columns; ++$j) {
if ($column != $j) {
$newMatrix[$r][$c] = $this->matrix[$i][$j];
++$c;
}
}
++$r;
}
}
return new self($newMatrix, false);
}
public function isSingular(): bool
{
return $this->getDeterminant() == 0;
}
/**
* Frobenius norm (HilbertSchmidt norm, Euclidean norm) (‖A‖F)
* Square root of the sum of the square of all elements.
*
* https://en.wikipedia.org/wiki/Matrix_norm#Frobenius_norm
*
* _____________
* /ᵐ ⁿ
* ‖A‖F = √ Σ Σ |aᵢⱼ|²
* ᵢ₌₁ ᵢ₌₁
*/
public function frobeniusNorm(): float
{
$squareSum = 0;
for ($i = 0; $i < $this->rows; ++$i) {
for ($j = 0; $j < $this->columns; ++$j) {
$squareSum += $this->matrix[$i][$j] ** 2;
}
}
return $squareSum ** .5;
}
/**
* Returns the transpose of given array
*/
public static function transposeArray(array $array): array
{
return (new self($array, false))->transpose()->toArray();
}
/**
* Returns the dot product of two arrays<br>
* Matrix::dot(x, y) ==> x.y'
*/
public static function dot(array $array1, array $array2): array
{
$m1 = new self($array1, false);
$m2 = new self($array2, false);
return $m1->multiply($m2->transpose())->toArray()[0];
}
/**
* Element-wise addition or substraction depending on the given sign parameter
*/
private function sum(self $other, int $sign = 1): self
{
$a1 = $this->toArray();
$a2 = $other->toArray();
$newMatrix = [];
for ($i = 0; $i < $this->rows; ++$i) {
for ($k = 0; $k < $this->columns; ++$k) {
$newMatrix[$i][$k] = $a1[$i][$k] + $sign * $a2[$i][$k];
}
}
return new self($newMatrix, false);
}
/**
* Returns diagonal identity matrix of the same size of this matrix
*/
private function getIdentity(): self
{
$array = array_fill(0, $this->rows, array_fill(0, $this->columns, 0));
for ($i = 0; $i < $this->rows; ++$i) {
$array[$i][$i] = 1;
}
return new self($array, false);
}
}
@@ -0,0 +1,23 @@
<?php
declare(strict_types=1);
namespace Phpml\Math;
class Product
{
/**
* @return mixed
*/
public static function scalar(array $a, array $b)
{
$product = 0;
foreach ($a as $index => $value) {
if (is_numeric($value) && is_numeric($b[$index])) {
$product += (float) $value * (float) $b[$index];
}
}
return $product;
}
}
@@ -0,0 +1,173 @@
<?php
declare(strict_types=1);
namespace Phpml\Math;
use ArrayIterator;
use IteratorAggregate;
class Set implements IteratorAggregate
{
/**
* @var string[]|int[]|float[]|bool[]
*/
private $elements = [];
/**
* @param string[]|int[]|float[]|bool[] $elements
*/
public function __construct(array $elements = [])
{
$this->elements = self::sanitize($elements);
}
/**
* Creates the union of A and B.
*/
public static function union(self $a, self $b): self
{
return new self(array_merge($a->toArray(), $b->toArray()));
}
/**
* Creates the intersection of A and B.
*/
public static function intersection(self $a, self $b): self
{
return new self(array_intersect($a->toArray(), $b->toArray()));
}
/**
* Creates the difference of A and B.
*/
public static function difference(self $a, self $b): self
{
return new self(array_diff($a->toArray(), $b->toArray()));
}
/**
* Creates the Cartesian product of A and B.
*
* @return Set[]
*/
public static function cartesian(self $a, self $b): array
{
$cartesian = [];
foreach ($a as $multiplier) {
foreach ($b as $multiplicand) {
$cartesian[] = new self(array_merge([$multiplicand], [$multiplier]));
}
}
return $cartesian;
}
/**
* Creates the power set of A.
*
* @return Set[]
*/
public static function power(self $a): array
{
$power = [new self()];
foreach ($a as $multiplicand) {
foreach ($power as $multiplier) {
$power[] = new self(array_merge([$multiplicand], $multiplier->toArray()));
}
}
return $power;
}
/**
* @param string|int|float|bool $element
*/
public function add($element): self
{
return $this->addAll([$element]);
}
/**
* @param string[]|int[]|float[]|bool[] $elements
*/
public function addAll(array $elements): self
{
$this->elements = self::sanitize(array_merge($this->elements, $elements));
return $this;
}
/**
* @param string|int|float $element
*/
public function remove($element): self
{
return $this->removeAll([$element]);
}
/**
* @param string[]|int[]|float[] $elements
*/
public function removeAll(array $elements): self
{
$this->elements = self::sanitize(array_diff($this->elements, $elements));
return $this;
}
/**
* @param string|int|float $element
*/
public function contains($element): bool
{
return $this->containsAll([$element]);
}
/**
* @param string[]|int[]|float[] $elements
*/
public function containsAll(array $elements): bool
{
return count(array_diff($elements, $this->elements)) === 0;
}
/**
* @return string[]|int[]|float[]|bool[]
*/
public function toArray(): array
{
return $this->elements;
}
public function getIterator(): ArrayIterator
{
return new ArrayIterator($this->elements);
}
public function isEmpty(): bool
{
return $this->cardinality() === 0;
}
public function cardinality(): int
{
return count($this->elements);
}
/**
* Removes duplicates and rewrites index.
*
* @param string[]|int[]|float[]|bool[] $elements
*
* @return string[]|int[]|float[]|bool[]
*/
private static function sanitize(array $elements): array
{
sort($elements, SORT_ASC);
return array_values(array_unique($elements, SORT_ASC));
}
}
@@ -0,0 +1,141 @@
<?php
declare(strict_types=1);
namespace Phpml\Math\Statistic;
use Phpml\Exception\InvalidArgumentException;
/**
* Analysis of variance
* https://en.wikipedia.org/wiki/Analysis_of_variance
*/
final class ANOVA
{
/**
* The one-way ANOVA tests the null hypothesis that 2 or more groups have
* the same population mean. The test is applied to samples from two or
* more groups, possibly with differing sizes.
*
* @param array[] $samples - each row is class samples
*
* @return float[]
*/
public static function oneWayF(array $samples): array
{
$classes = count($samples);
if ($classes < 2) {
throw new InvalidArgumentException('The array must have at least 2 elements');
}
$samplesPerClass = array_map(static function (array $class): int {
return count($class);
}, $samples);
$allSamples = (int) array_sum($samplesPerClass);
$ssAllSamples = self::sumOfSquaresPerFeature($samples);
$sumSamples = self::sumOfFeaturesPerClass($samples);
$squareSumSamples = self::sumOfSquares($sumSamples);
$sumSamplesSquare = self::squaresSum($sumSamples);
$ssbn = self::calculateSsbn($samples, $sumSamplesSquare, $samplesPerClass, $squareSumSamples, $allSamples);
$sswn = self::calculateSswn($ssbn, $ssAllSamples, $squareSumSamples, $allSamples);
$dfbn = $classes - 1;
$dfwn = $allSamples - $classes;
$msb = array_map(static function ($s) use ($dfbn) {
return $s / $dfbn;
}, $ssbn);
$msw = array_map(static function ($s) use ($dfwn) {
if ($dfwn === 0) {
return 1;
}
return $s / $dfwn;
}, $sswn);
$f = [];
foreach ($msb as $index => $msbValue) {
$f[$index] = $msbValue / $msw[$index];
}
return $f;
}
private static function sumOfSquaresPerFeature(array $samples): array
{
$sum = array_fill(0, count($samples[0][0]), 0);
foreach ($samples as $class) {
foreach ($class as $sample) {
foreach ($sample as $index => $feature) {
$sum[$index] += $feature ** 2;
}
}
}
return $sum;
}
private static function sumOfFeaturesPerClass(array $samples): array
{
return array_map(static function (array $class): array {
$sum = array_fill(0, count($class[0]), 0);
foreach ($class as $sample) {
foreach ($sample as $index => $feature) {
$sum[$index] += $feature;
}
}
return $sum;
}, $samples);
}
private static function sumOfSquares(array $sums): array
{
$squares = array_fill(0, count($sums[0]), 0);
foreach ($sums as $row) {
foreach ($row as $index => $sum) {
$squares[$index] += $sum;
}
}
return array_map(static function ($sum) {
return $sum ** 2;
}, $squares);
}
private static function squaresSum(array $sums): array
{
foreach ($sums as &$row) {
foreach ($row as &$sum) {
$sum **= 2;
}
}
return $sums;
}
private static function calculateSsbn(array $samples, array $sumSamplesSquare, array $samplesPerClass, array $squareSumSamples, int $allSamples): array
{
$ssbn = array_fill(0, count($samples[0][0]), 0);
foreach ($sumSamplesSquare as $classIndex => $class) {
foreach ($class as $index => $feature) {
$ssbn[$index] += $feature / $samplesPerClass[$classIndex];
}
}
foreach ($squareSumSamples as $index => $sum) {
$ssbn[$index] -= $sum / $allSamples;
}
return $ssbn;
}
private static function calculateSswn(array $ssbn, array $ssAllSamples, array $squareSumSamples, int $allSamples): array
{
$sswn = [];
foreach ($ssAllSamples as $index => $ss) {
$sswn[$index] = ($ss - $squareSumSamples[$index] / $allSamples) - $ssbn[$index];
}
return $sswn;
}
}
@@ -0,0 +1,41 @@
<?php
declare(strict_types=1);
namespace Phpml\Math\Statistic;
use Phpml\Exception\InvalidArgumentException;
class Correlation
{
/**
* @param int[]|float[] $x
* @param int[]|float[] $y
*
* @throws InvalidArgumentException
*/
public static function pearson(array $x, array $y): float
{
if (count($x) !== count($y)) {
throw new InvalidArgumentException('Size of given arrays does not match');
}
$count = count($x);
$meanX = Mean::arithmetic($x);
$meanY = Mean::arithmetic($y);
$axb = 0;
$a2 = 0;
$b2 = 0;
for ($i = 0; $i < $count; ++$i) {
$a = $x[$i] - $meanX;
$b = $y[$i] - $meanY;
$axb += ($a * $b);
$a2 += $a ** 2;
$b2 += $b ** 2;
}
return $axb / ($a2 * $b2) ** .5;
}
}
@@ -0,0 +1,148 @@
<?php
declare(strict_types=1);
namespace Phpml\Math\Statistic;
use Phpml\Exception\InvalidArgumentException;
class Covariance
{
/**
* Calculates covariance from two given arrays, x and y, respectively
*
* @throws InvalidArgumentException
*/
public static function fromXYArrays(array $x, array $y, bool $sample = true, ?float $meanX = null, ?float $meanY = null): float
{
$n = count($x);
if ($n === 0 || count($y) === 0) {
throw new InvalidArgumentException('The array has zero elements');
}
if ($sample && $n === 1) {
throw new InvalidArgumentException('The array must have at least 2 elements');
}
if ($meanX === null) {
$meanX = Mean::arithmetic($x);
}
if ($meanY === null) {
$meanY = Mean::arithmetic($y);
}
$sum = 0.0;
foreach ($x as $index => $xi) {
$yi = $y[$index];
$sum += ($xi - $meanX) * ($yi - $meanY);
}
if ($sample) {
--$n;
}
return $sum / $n;
}
/**
* Calculates covariance of two dimensions, i and k in the given data.
*
* @throws InvalidArgumentException
* @throws \Exception
*/
public static function fromDataset(array $data, int $i, int $k, bool $sample = true, ?float $meanX = null, ?float $meanY = null): float
{
if (count($data) === 0) {
throw new InvalidArgumentException('The array has zero elements');
}
$n = count($data);
if ($sample && $n === 1) {
throw new InvalidArgumentException('The array must have at least 2 elements');
}
if ($i < 0 || $k < 0 || $i >= $n || $k >= $n) {
throw new InvalidArgumentException('Given indices i and k do not match with the dimensionality of data');
}
if ($meanX === null || $meanY === null) {
$x = array_column($data, $i);
$y = array_column($data, $k);
$meanX = Mean::arithmetic($x);
$meanY = Mean::arithmetic($y);
$sum = 0.0;
foreach ($x as $index => $xi) {
$yi = $y[$index];
$sum += ($xi - $meanX) * ($yi - $meanY);
}
} else {
// In the case, whole dataset given along with dimension indices, i and k,
// we would like to avoid getting column data with array_column and operate
// over this extra copy of column data for memory efficiency purposes.
//
// Instead we traverse through the whole data and get what we actually need
// without copying the data. This way, memory use will be reduced
// with a slight cost of CPU utilization.
$sum = 0.0;
foreach ($data as $row) {
$val = [0, 0];
foreach ($row as $index => $col) {
if ($index == $i) {
$val[0] = $col - $meanX;
}
if ($index == $k) {
$val[1] = $col - $meanY;
}
}
$sum += $val[0] * $val[1];
}
}
if ($sample) {
--$n;
}
return $sum / $n;
}
/**
* Returns the covariance matrix of n-dimensional data
*
* @param array|null $means
*/
public static function covarianceMatrix(array $data, ?array $means = null): array
{
$n = count($data[0]);
if ($means === null) {
$means = [];
for ($i = 0; $i < $n; ++$i) {
$means[] = Mean::arithmetic(array_column($data, $i));
}
}
$cov = [];
for ($i = 0; $i < $n; ++$i) {
for ($k = 0; $k < $n; ++$k) {
if ($i > $k) {
$cov[$i][$k] = $cov[$k][$i];
} else {
$cov[$i][$k] = self::fromDataset(
$data,
$i,
$k,
true,
$means[$i],
$means[$k]
);
}
}
}
return $cov;
}
}
@@ -0,0 +1,50 @@
<?php
declare(strict_types=1);
namespace Phpml\Math\Statistic;
class Gaussian
{
/**
* @var float
*/
protected $mean;
/**
* @var float
*/
protected $std;
public function __construct(float $mean, float $std)
{
$this->mean = $mean;
$this->std = $std;
}
/**
* Returns probability density of the given <i>$value</i>
*
* @return float|int
*/
public function pdf(float $value)
{
// Calculate the probability density by use of normal/Gaussian distribution
// Ref: https://en.wikipedia.org/wiki/Normal_distribution
$std2 = $this->std ** 2;
$mean = $this->mean;
return exp(-(($value - $mean) ** 2) / (2 * $std2)) / ((2 * $std2 * M_PI) ** .5);
}
/**
* Returns probability density value of the given <i>$value</i> based on
* given standard deviation and the mean
*/
public static function distributionPdf(float $mean, float $std, float $value): float
{
$normal = new self($mean, $std);
return $normal->pdf($value);
}
}
@@ -0,0 +1,65 @@
<?php
declare(strict_types=1);
namespace Phpml\Math\Statistic;
use Phpml\Exception\InvalidArgumentException;
class Mean
{
/**
* @throws InvalidArgumentException
*/
public static function arithmetic(array $numbers): float
{
self::checkArrayLength($numbers);
return array_sum($numbers) / count($numbers);
}
/**
* @return float|mixed
*
* @throws InvalidArgumentException
*/
public static function median(array $numbers)
{
self::checkArrayLength($numbers);
$count = count($numbers);
$middleIndex = (int) floor($count / 2);
sort($numbers, SORT_NUMERIC);
$median = $numbers[$middleIndex];
if ($count % 2 === 0) {
$median = ($median + $numbers[$middleIndex - 1]) / 2;
}
return $median;
}
/**
* @return mixed
*
* @throws InvalidArgumentException
*/
public static function mode(array $numbers)
{
self::checkArrayLength($numbers);
$values = array_count_values($numbers);
return array_search(max($values), $values, true);
}
/**
* @throws InvalidArgumentException
*/
private static function checkArrayLength(array $array): void
{
if (count($array) === 0) {
throw new InvalidArgumentException('The array has zero elements');
}
}
}
@@ -0,0 +1,59 @@
<?php
declare(strict_types=1);
namespace Phpml\Math\Statistic;
use Phpml\Exception\InvalidArgumentException;
class StandardDeviation
{
/**
* @param float[]|int[] $numbers
*/
public static function population(array $numbers, bool $sample = true): float
{
$n = count($numbers);
if ($n === 0) {
throw new InvalidArgumentException('The array has zero elements');
}
if ($sample && $n === 1) {
throw new InvalidArgumentException('The array must have at least 2 elements');
}
$mean = Mean::arithmetic($numbers);
$carry = 0.0;
foreach ($numbers as $val) {
$carry += ($val - $mean) ** 2;
}
if ($sample) {
--$n;
}
return ($carry / $n) ** .5;
}
/**
* Sum of squares deviations
* ∑⟮xᵢ - μ⟯²
*
* @param float[]|int[] $numbers
*/
public static function sumOfSquares(array $numbers): float
{
if (count($numbers) === 0) {
throw new InvalidArgumentException('The array has zero elements');
}
$mean = Mean::arithmetic($numbers);
return array_sum(array_map(
static function ($val) use ($mean): float {
return ($val - $mean) ** 2;
},
$numbers
));
}
}
@@ -0,0 +1,27 @@
<?php
declare(strict_types=1);
namespace Phpml\Math\Statistic;
/**
* In probability theory and statistics, variance is the expectation of the squared deviation of a random variable from its mean.
* Informally, it measures how far a set of (random) numbers are spread out from their average value
* https://en.wikipedia.org/wiki/Variance
*/
final class Variance
{
/**
* Population variance
* Use when all possible observations of the system are present.
* If used with a subset of data (sample variance), it will be a biased variance.
*
* ∑⟮xᵢ - μ⟯²
* σ² = ----------
* N
*/
public static function population(array $population): float
{
return StandardDeviation::sumOfSquares($population) / count($population);
}
}
@@ -0,0 +1,35 @@
<?php
declare(strict_types=1);
namespace Phpml\Metric;
use Phpml\Exception\InvalidArgumentException;
class Accuracy
{
/**
* @return float|int
*
* @throws InvalidArgumentException
*/
public static function score(array $actualLabels, array $predictedLabels, bool $normalize = true)
{
if (count($actualLabels) != count($predictedLabels)) {
throw new InvalidArgumentException('Size of given arrays does not match');
}
$score = 0;
foreach ($actualLabels as $index => $label) {
if ($label == $predictedLabels[$index]) {
++$score;
}
}
if ($normalize) {
$score /= count($actualLabels);
}
return $score;
}
}
@@ -0,0 +1,226 @@
<?php
declare(strict_types=1);
namespace Phpml\Metric;
use Phpml\Exception\InvalidArgumentException;
class ClassificationReport
{
public const MICRO_AVERAGE = 1;
public const MACRO_AVERAGE = 2;
public const WEIGHTED_AVERAGE = 3;
/**
* @var array
*/
private $truePositive = [];
/**
* @var array
*/
private $falsePositive = [];
/**
* @var array
*/
private $falseNegative = [];
/**
* @var array
*/
private $support = [];
/**
* @var array
*/
private $precision = [];
/**
* @var array
*/
private $recall = [];
/**
* @var array
*/
private $f1score = [];
/**
* @var array
*/
private $average = [];
public function __construct(array $actualLabels, array $predictedLabels, int $average = self::MACRO_AVERAGE)
{
$averagingMethods = range(self::MICRO_AVERAGE, self::WEIGHTED_AVERAGE);
if (!in_array($average, $averagingMethods, true)) {
throw new InvalidArgumentException('Averaging method must be MICRO_AVERAGE, MACRO_AVERAGE or WEIGHTED_AVERAGE');
}
$this->aggregateClassificationResults($actualLabels, $predictedLabels);
$this->computeMetrics();
$this->computeAverage($average);
}
public function getPrecision(): array
{
return $this->precision;
}
public function getRecall(): array
{
return $this->recall;
}
public function getF1score(): array
{
return $this->f1score;
}
public function getSupport(): array
{
return $this->support;
}
public function getAverage(): array
{
return $this->average;
}
private function aggregateClassificationResults(array $actualLabels, array $predictedLabels): void
{
$truePositive = $falsePositive = $falseNegative = $support = self::getLabelIndexedArray($actualLabels, $predictedLabels);
foreach ($actualLabels as $index => $actual) {
$predicted = $predictedLabels[$index];
++$support[$actual];
if ($actual === $predicted) {
++$truePositive[$actual];
} else {
++$falsePositive[$predicted];
++$falseNegative[$actual];
}
}
$this->truePositive = $truePositive;
$this->falsePositive = $falsePositive;
$this->falseNegative = $falseNegative;
$this->support = $support;
}
private function computeMetrics(): void
{
foreach ($this->truePositive as $label => $tp) {
$this->precision[$label] = $this->computePrecision($tp, $this->falsePositive[$label]);
$this->recall[$label] = $this->computeRecall($tp, $this->falseNegative[$label]);
$this->f1score[$label] = $this->computeF1Score((float) $this->precision[$label], (float) $this->recall[$label]);
}
}
private function computeAverage(int $average): void
{
switch ($average) {
case self::MICRO_AVERAGE:
$this->computeMicroAverage();
return;
case self::MACRO_AVERAGE:
$this->computeMacroAverage();
return;
case self::WEIGHTED_AVERAGE:
$this->computeWeightedAverage();
return;
}
}
private function computeMicroAverage(): void
{
$truePositive = (int) array_sum($this->truePositive);
$falsePositive = (int) array_sum($this->falsePositive);
$falseNegative = (int) array_sum($this->falseNegative);
$precision = $this->computePrecision($truePositive, $falsePositive);
$recall = $this->computeRecall($truePositive, $falseNegative);
$f1score = $this->computeF1Score($precision, $recall);
$this->average = compact('precision', 'recall', 'f1score');
}
private function computeMacroAverage(): void
{
foreach (['precision', 'recall', 'f1score'] as $metric) {
$values = $this->{$metric};
if (count($values) == 0) {
$this->average[$metric] = 0.0;
continue;
}
$this->average[$metric] = array_sum($values) / count($values);
}
}
private function computeWeightedAverage(): void
{
foreach (['precision', 'recall', 'f1score'] as $metric) {
$values = $this->{$metric};
if (count($values) == 0) {
$this->average[$metric] = 0.0;
continue;
}
$sum = 0;
foreach ($values as $i => $value) {
$sum += $value * $this->support[$i];
}
$this->average[$metric] = $sum / array_sum($this->support);
}
}
private function computePrecision(int $truePositive, int $falsePositive): float
{
$divider = $truePositive + $falsePositive;
if ($divider == 0) {
return 0.0;
}
return $truePositive / $divider;
}
private function computeRecall(int $truePositive, int $falseNegative): float
{
$divider = $truePositive + $falseNegative;
if ($divider == 0) {
return 0.0;
}
return $truePositive / $divider;
}
private function computeF1Score(float $precision, float $recall): float
{
$divider = $precision + $recall;
if ($divider == 0) {
return 0.0;
}
return 2.0 * (($precision * $recall) / $divider);
}
private static function getLabelIndexedArray(array $actualLabels, array $predictedLabels): array
{
$labels = array_values(array_unique(array_merge($actualLabels, $predictedLabels)));
sort($labels);
return (array) array_combine($labels, array_fill(0, count($labels), 0));
}
}
@@ -0,0 +1,53 @@
<?php
declare(strict_types=1);
namespace Phpml\Metric;
class ConfusionMatrix
{
public static function compute(array $actualLabels, array $predictedLabels, array $labels = []): array
{
$labels = count($labels) === 0 ? self::getUniqueLabels($actualLabels) : array_flip($labels);
$matrix = self::generateMatrixWithZeros($labels);
foreach ($actualLabels as $index => $actual) {
$predicted = $predictedLabels[$index];
if (!isset($labels[$actual], $labels[$predicted])) {
continue;
}
if ($predicted === $actual) {
$row = $column = $labels[$actual];
} else {
$row = $labels[$actual];
$column = $labels[$predicted];
}
++$matrix[$row][$column];
}
return $matrix;
}
private static function generateMatrixWithZeros(array $labels): array
{
$count = count($labels);
$matrix = [];
for ($i = 0; $i < $count; ++$i) {
$matrix[$i] = array_fill(0, $count, 0);
}
return $matrix;
}
private static function getUniqueLabels(array $labels): array
{
$labels = array_values(array_unique($labels));
sort($labels);
return array_flip($labels);
}
}
@@ -0,0 +1,86 @@
<?php
declare(strict_types=1);
namespace Phpml\Metric;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Math\Statistic\Correlation;
use Phpml\Math\Statistic\Mean;
final class Regression
{
public static function meanSquaredError(array $targets, array $predictions): float
{
self::assertCountEquals($targets, $predictions);
$errors = [];
foreach ($targets as $index => $target) {
$errors[] = (($target - $predictions[$index]) ** 2);
}
return Mean::arithmetic($errors);
}
public static function meanSquaredLogarithmicError(array $targets, array $predictions): float
{
self::assertCountEquals($targets, $predictions);
$errors = [];
foreach ($targets as $index => $target) {
$errors[] = log((1 + $target) / (1 + $predictions[$index])) ** 2;
}
return Mean::arithmetic($errors);
}
public static function meanAbsoluteError(array $targets, array $predictions): float
{
self::assertCountEquals($targets, $predictions);
$errors = [];
foreach ($targets as $index => $target) {
$errors[] = abs($target - $predictions[$index]);
}
return Mean::arithmetic($errors);
}
public static function medianAbsoluteError(array $targets, array $predictions): float
{
self::assertCountEquals($targets, $predictions);
$errors = [];
foreach ($targets as $index => $target) {
$errors[] = abs($target - $predictions[$index]);
}
return (float) Mean::median($errors);
}
public static function r2Score(array $targets, array $predictions): float
{
self::assertCountEquals($targets, $predictions);
return Correlation::pearson($targets, $predictions) ** 2;
}
public static function maxError(array $targets, array $predictions): float
{
self::assertCountEquals($targets, $predictions);
$errors = [];
foreach ($targets as $index => $target) {
$errors[] = abs($target - $predictions[$index]);
}
return (float) max($errors);
}
private static function assertCountEquals(array &$targets, array &$predictions): void
{
if (count($targets) !== count($predictions)) {
throw new InvalidArgumentException('Targets count must be equal with predictions count');
}
}
}
@@ -0,0 +1,42 @@
<?php
declare(strict_types=1);
namespace Phpml;
use Phpml\Exception\FileException;
use Phpml\Exception\SerializeException;
class ModelManager
{
public function saveToFile(Estimator $estimator, string $filepath): void
{
if (!is_writable(dirname($filepath))) {
throw new FileException(sprintf('File "%s" cannot be saved.', basename($filepath)));
}
$serialized = serialize($estimator);
if (!isset($serialized[0])) {
throw new SerializeException(sprintf('Class "%s" cannot be serialized.', gettype($estimator)));
}
$result = file_put_contents($filepath, $serialized, LOCK_EX);
if ($result === false) {
throw new FileException(sprintf('File "%s" cannot be saved.', basename($filepath)));
}
}
public function restoreFromFile(string $filepath): Estimator
{
if (!file_exists($filepath) || !is_readable($filepath)) {
throw new FileException(sprintf('File "%s" cannot be opened.', basename($filepath)));
}
$object = unserialize((string) file_get_contents($filepath));
if ($object === false || !$object instanceof Estimator) {
throw new SerializeException(sprintf('"%s" cannot be unserialized.', basename($filepath)));
}
return $object;
}
}
@@ -0,0 +1,19 @@
<?php
declare(strict_types=1);
namespace Phpml\NeuralNetwork;
interface ActivationFunction
{
/**
* @param float|int $value
*/
public function compute($value): float;
/**
* @param float|int $value
* @param float|int $computedvalue
*/
public function differentiate($value, $computedvalue): float;
}
@@ -0,0 +1,31 @@
<?php
declare(strict_types=1);
namespace Phpml\NeuralNetwork\ActivationFunction;
use Phpml\NeuralNetwork\ActivationFunction;
class BinaryStep implements ActivationFunction
{
/**
* @param float|int $value
*/
public function compute($value): float
{
return $value >= 0 ? 1.0 : 0.0;
}
/**
* @param float|int $value
* @param float|int $computedvalue
*/
public function differentiate($value, $computedvalue): float
{
if ($value === 0 || $value === 0.0) {
return 1;
}
return 0;
}
}

Some files were not shown because too many files have changed in this diff Show More