<?php
// This file is part of Moodle - http://moodle.org/
//
// Moodle is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Moodle is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Moodle. If not, see <http://www.gnu.org/licenses/>.

/**
 * Php predictions processor
 *
 * @package   mlbackend_php
 * @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
 * @license   http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
 */

namespace mlbackend_php;

defined('MOODLE_INTERNAL') || die();

use Phpml\Preprocessing\Normalizer;
use Phpml\CrossValidation\RandomSplit;
use Phpml\Dataset\ArrayDataset;
use Phpml\ModelManager;
use Phpml\Classification\Linear\LogisticRegression;
use Phpml\Metric\ClassificationReport;

/**
 * PHP predictions processor.
 *
 * Trains, evaluates and runs a Phpml logistic regression classifier over CSV datasets.
 * Regression is not supported: the regressor methods always throw a coding_exception.
 *
 * @package   mlbackend_php
 * @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
 * @license   http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
 */
class processor implements \core_analytics\classifier, \core_analytics\regressor, \core_analytics\packable {

    /**
     * Size of training / prediction batches.
     */
    const BATCH_SIZE = 5000;

    /**
     * Number of train iterations.
     */
    const TRAIN_ITERATIONS = 500;

    /**
     * File name of the serialised model.
     */
    const MODEL_FILENAME = 'model.ser';

    /**
     * @var bool True once an evaluation stopped reading samples because the estimated memory limit was reached.
     */
    protected $limitedsize = false;

    /**
     * Checks if the processor is ready to use.
     *
     * @return bool|string True if the processor can be used, a localised error message otherwise.
     */
    public function is_ready() {
        if (version_compare(phpversion(), '7.0.0') < 0) {
            return get_string('errorphp7required', 'mlbackend_php');
        }
        return true;
    }

    /**
     * Delete the stored models.
     *
     * @param string $uniqueid
     * @param string $modelversionoutputdir
     * @return null
     */
    public function clear_model($uniqueid, $modelversionoutputdir) {
        remove_dir($modelversionoutputdir);
    }

    /**
     * Delete the output directory.
     *
     * @param string $modeloutputdir
     * @param string $uniqueid
     * @return null
     */
    public function delete_output_dir($modeloutputdir, $uniqueid) {
        remove_dir($modeloutputdir);
    }

    /**
     * Train this processor classification model using the provided supervised learning dataset.
     *
     * If a serialised model already exists in $outputdir it is restored and trained further,
     * otherwise a new classifier is instantiated. The trained classifier is saved back to $outputdir.
     *
     * @param string $uniqueid
     * @param \stored_file $dataset
     * @param string $outputdir
     * @return \stdClass Object with 'status' and 'info'; status is NO_DATASET if the dataset has less than 2 samples.
     */
    public function train_classification($uniqueid, \stored_file $dataset, $outputdir) {

        $modelfilepath = $this->get_model_filepath($outputdir);

        $modelmanager = new ModelManager();

        if (file_exists($modelfilepath)) {
            $classifier = $modelmanager->restoreFromFile($modelfilepath);
        } else {
            $classifier = $this->instantiate_algorithm();
        }

        $fh = $dataset->get_content_file_handle();

        // The first line contains the var names and the second one their values.
        $metadata = $this->extract_metadata($fh);
        $targetclasses = json_decode($metadata['targetclasses']);

        // Skip headers.
        fgets($fh);

        $samples = array();
        $targets = array();
        $morethan1sample = false;
        while (($data = fgetcsv($fh)) !== false) {
            $sampledata = array_map('floatval', $data);
            $samples[] = array_slice($sampledata, 0, $metadata['nfeatures']);
            $targets[] = intval($data[$metadata['nfeatures']]);

            $nsamples = count($samples);
            if ($nsamples === self::BATCH_SIZE) {
                // Training in batches to avoid running out of memory.
                $classifier->partialTrain($samples, $targets, $targetclasses);
                $samples = array();
                $targets = array();
            }
            if (!$morethan1sample && $nsamples > 1) {
                $morethan1sample = true;
            }
        }
        fclose($fh);

        if (!$morethan1sample) {
            $resultobj = new \stdClass();
            $resultobj->status = \core_analytics\model::NO_DATASET;
            $resultobj->info = array();
            return $resultobj;
        }

        // Train the remaining samples.
        if ($samples) {
            $classifier->partialTrain($samples, $targets, $targetclasses);
        }

        $resultobj = new \stdClass();
        $resultobj->status = \core_analytics\model::OK;
        $resultobj->info = array();

        // Store the trained model.
        $modelmanager->saveToFile($classifier, $modelfilepath);

        return $resultobj;
    }

    /**
     * Classifies the provided dataset samples.
     *
     * The first column of each dataset row is used as the sample id, the following
     * 'nfeatures' columns are the features passed to the classifier.
     *
     * @throws \moodle_exception If there is no trained model in $outputdir.
     * @param string $uniqueid
     * @param \stored_file $dataset
     * @param string $outputdir
     * @return \stdClass Object with 'status', 'info' and, if there are samples, 'predictions' (list of [sampleid, prediction]).
     */
    public function classify($uniqueid, \stored_file $dataset, $outputdir) {

        $classifier = $this->load_classifier($outputdir);

        $fh = $dataset->get_content_file_handle();

        // The first line contains the var names and the second one their values.
        $metadata = $this->extract_metadata($fh);

        // Skip headers.
        fgets($fh);

        $sampleids = array();
        $samples = array();
        $predictions = array();
        while (($data = fgetcsv($fh)) !== false) {
            $sampledata = array_map('floatval', $data);
            $sampleids[] = $data[0];
            $samples[] = array_slice($sampledata, 1, $metadata['nfeatures']);

            if (count($samples) === self::BATCH_SIZE) {
                // Predicting in batches to avoid running out of memory.

                // Append predictions incrementally, we want $sampleids keys in sync with $predictions keys.
                foreach ($classifier->predict($samples) as $prediction) {
                    $predictions[] = $prediction;
                }
                $samples = array();
            }
        }
        fclose($fh);

        // Finish the remaining predictions. They must be appended: an array union (+) would discard
        // them whenever a previous batch already filled the same integer keys.
        if ($samples) {
            foreach ($classifier->predict($samples) as $prediction) {
                $predictions[] = $prediction;
            }
        }

        $resultobj = new \stdClass();
        $resultobj->status = \core_analytics\model::OK;
        $resultobj->info = array();

        foreach ($predictions as $index => $prediction) {
            $resultobj->predictions[$index] = array($sampleids[$index], $prediction);
        }

        return $resultobj;
    }

    /**
     * Evaluates this processor classification model using the provided supervised learning dataset.
     *
     * During evaluation we need to shuffle the evaluation dataset samples to detect deviated results,
     * if the dataset is massive we can not load everything into memory. We know that 2GB is the
     * minimum memory limit we should have (\core_analytics\model::heavy_duty_mode), if we subtract the memory
     * that we already consumed and the memory that Phpml algorithms will need we should still have at
     * least 500MB of memory, which should be enough to evaluate a model. This is not guaranteed to
     * work for every site, but it should minimise memory limit problems. Site admins
     * can still set $CFG->mlbackend_php_no_evaluation_limits to true to skip this 500MB limit.
     *
     * @throws \moodle_exception If $trainedmodeldir is provided and it does not contain a trained model.
     * @param string $uniqueid
     * @param float $maxdeviation
     * @param int $niterations
     * @param \stored_file $dataset
     * @param string $outputdir
     * @param string $trainedmodeldir
     * @return \stdClass
     */
    public function evaluate_classification($uniqueid, $maxdeviation, $niterations, \stored_file $dataset,
            $outputdir, $trainedmodeldir) {
        // Required to read the mlbackend_php_no_evaluation_limits setting; without it the setting is never seen.
        global $CFG;

        if ($trainedmodeldir) {
            // We overwrite the number of iterations as the results will always be the same.
            $niterations = 1;
            // Loaded before opening the dataset so that a missing model does not leave the file handle open.
            $classifier = $this->load_classifier($trainedmodeldir);
        }

        $fh = $dataset->get_content_file_handle();

        // The first line contains the var names and the second one their values.
        $metadata = $this->extract_metadata($fh);

        // Skip headers.
        fgets($fh);

        // We allow admins to disable evaluation memory usage limits by modifying config.php.
        $limitmemory = empty($CFG->mlbackend_php_no_evaluation_limits);

        if ($limitmemory) {
            $samplessize = 0;
            $limit = get_real_size('500MB');

            // Just an approximation, will depend on PHP version, compile options...
            // Double size + zval struct (6 bytes + 8 bytes + 16 bytes) + array bucket (96 bytes)
            // https://nikic.github.io/2011/12/12/How-big-are-PHP-arrays-really-Hint-BIG.html.
            $floatsize = (PHP_INT_SIZE * 2) + 6 + 8 + 16 + 96;
        }

        $samples = array();
        $targets = array();
        while (($data = fgetcsv($fh)) !== false) {
            $sampledata = array_map('floatval', $data);

            $samples[] = array_slice($sampledata, 0, $metadata['nfeatures']);
            $targets[] = intval($data[$metadata['nfeatures']]);

            if ($limitmemory) {
                // We will have plenty of missing values in the dataset so it should be a conservative approximation.
                $samplessize = $samplessize + (count($sampledata) * $floatsize);

                // Stop fetching more samples.
                if ($samplessize >= $limit) {
                    $this->limitedsize = true;
                    break;
                }
            }
        }
        fclose($fh);

        // We need at least 2 samples belonging to each target.
        $counts = array_count_values($targets);
        $ntargets = count(explode(',', $metadata['targetclasses']));
        $notenoughdata = ($ntargets > count($counts));
        foreach ($counts as $count) {
            if ($count < 2) {
                $notenoughdata = true;
            }
        }
        if ($notenoughdata) {
            $resultobj = new \stdClass();
            $resultobj->status = \core_analytics\model::NOT_ENOUGH_DATA;
            $resultobj->score = 0;
            $resultobj->info = array(get_string('errornotenoughdata', 'mlbackend_php'));
            return $resultobj;
        }

        $scores = array();

        // Evaluate the model multiple times to confirm the results are not significantly random due to a short amount of data.
        for ($i = 0; $i < $niterations; $i++) {

            if (!$trainedmodeldir) {
                $classifier = $this->instantiate_algorithm();

                // Split up the dataset in training and testing.
                $split = new RandomSplit(new ArrayDataset($samples, $targets), 0.2);

                $classifier->train($split->getTrainSamples(), $split->getTrainLabels());
                $predictedlabels = $classifier->predict($split->getTestSamples());
                $report = new ClassificationReport($split->getTestLabels(), $predictedlabels,
                    ClassificationReport::WEIGHTED_AVERAGE);
            } else {
                // An already trained model is evaluated against the whole dataset.
                $predictedlabels = $classifier->predict($samples);
                $report = new ClassificationReport($targets, $predictedlabels,
                    ClassificationReport::WEIGHTED_AVERAGE);
            }
            $averages = $report->getAverage();
            $scores[] = $averages['f1score'];
        }

        // Let's fill the results changing the returned status code depending on the calculated scores.
        return $this->get_evaluation_result_object($dataset, $scores, $maxdeviation);
    }

    /**
     * Returns the results objects from all evaluations.
     *
     * The final score is the average of the provided scores. The status accumulates NOT_ENOUGH_DATA
     * when the standard deviation of the scores exceeds $maxdeviation and LOW_SCORE when the average
     * is below \core_analytics\model::MIN_SCORE.
     *
     * @param \stored_file $dataset
     * @param array $scores
     * @param float $maxdeviation
     * @return \stdClass Object with 'status', 'info' and 'score'.
     */
    protected function get_evaluation_result_object(\stored_file $dataset, $scores, $maxdeviation) {

        if (count($scores) === 1) {
            // A single evaluation: its score is the final score and there is no deviation.
            $avgscore = reset($scores);
            $modeldev = 0;
        } else {
            // Average f1 score of all evaluations as final score.
            $avgscore = \Phpml\Math\Statistic\Mean::arithmetic($scores);

            // Standard deviation should ideally be calculated against the area under the curve.
            $modeldev = \Phpml\Math\Statistic\StandardDeviation::population($scores);
        }

        // Let's fill the results object.
        $resultobj = new \stdClass();

        // Zero is ok, now we add other bits if something is not right.
        $resultobj->status = \core_analytics\model::OK;
        $resultobj->info = array();
        $resultobj->score = $avgscore;

        // If each iteration results varied too much we need more data to confirm that this is a valid model.
        if ($modeldev > $maxdeviation) {
            $resultobj->status = $resultobj->status + \core_analytics\model::NOT_ENOUGH_DATA;
            $a = new \stdClass();
            $a->deviation = $modeldev;
            $a->accepteddeviation = $maxdeviation;
            $resultobj->info[] = get_string('errornotenoughdatadev', 'mlbackend_php', $a);
        }

        if ($resultobj->score < \core_analytics\model::MIN_SCORE) {
            $resultobj->status = $resultobj->status + \core_analytics\model::LOW_SCORE;
            $a = new \stdClass();
            $a->score = $resultobj->score;
            $a->minscore = \core_analytics\model::MIN_SCORE;
            $resultobj->info[] = get_string('errorlowscore', 'mlbackend_php', $a);
        }

        if ($this->limitedsize === true) {
            $resultobj->info[] = get_string('datasetsizelimited', 'mlbackend_php', display_size($dataset->get_filesize()));
        }

        return $resultobj;
    }

    /**
     * Loads the pre-trained classifier.
     *
     * @throws \moodle_exception If the serialised model file does not exist in $outputdir.
     * @param string $outputdir
     * @return \Phpml\Classification\Linear\LogisticRegression
     */
    protected function load_classifier($outputdir) {
        $modelfilepath = $this->get_model_filepath($outputdir);

        if (!file_exists($modelfilepath)) {
            throw new \moodle_exception('errorcantloadmodel', 'mlbackend_php', '', $modelfilepath);
        }

        $modelmanager = new ModelManager();
        return $modelmanager->restoreFromFile($modelfilepath);
    }

    /**
     * Train this processor regression model using the provided supervised learning dataset.
     *
     * @throws \coding_exception Always, regression is not supported by this backend.
     * @param string $uniqueid
     * @param \stored_file $dataset
     * @param string $outputdir
     * @return \stdClass
     */
    public function train_regression($uniqueid, \stored_file $dataset, $outputdir) {
        throw new \coding_exception('This predictor does not support regression yet.');
    }

    /**
     * Estimates linear values for the provided dataset samples.
     *
     * @throws \coding_exception Always, regression is not supported by this backend.
     * @param string $uniqueid
     * @param \stored_file $dataset
     * @param mixed $outputdir
     * @return void
     */
    public function estimate($uniqueid, \stored_file $dataset, $outputdir) {
        throw new \coding_exception('This predictor does not support regression yet.');
    }

    /**
     * Evaluates this processor regression model using the provided supervised learning dataset.
     *
     * @throws \coding_exception Always, regression is not supported by this backend.
     * @param string $uniqueid
     * @param float $maxdeviation
     * @param int $niterations
     * @param \stored_file $dataset
     * @param string $outputdir
     * @param string $trainedmodeldir
     * @return \stdClass
     */
    public function evaluate_regression($uniqueid, $maxdeviation, $niterations, \stored_file $dataset,
            $outputdir, $trainedmodeldir) {
        throw new \coding_exception('This predictor does not support regression yet.');
    }

    /**
     * Exports the machine learning model.
     *
     * @throws \moodle_exception If there is no serialised model file in $modeldir.
     * @param  string $uniqueid  The model unique id
     * @param  string $modeldir  The directory that contains the trained model.
     * @return string            The path to the directory that contains the exported model.
     */
    public function export(string $uniqueid, string $modeldir) : string {

        $modelfilepath = $this->get_model_filepath($modeldir);

        if (!file_exists($modelfilepath)) {
            throw new \moodle_exception('errorexportmodelresult', 'analytics');
        }

        // We can use the actual $modeldir as the directory is not modified during export, just copied into a zip.
        return $modeldir;
    }

    /**
     * Imports the provided machine learning model.
     *
     * The serialised contents are validated before the classifier is restored and saved into $modeldir.
     *
     * @param  string $uniqueid The model unique id
     * @param  string $modeldir  The directory that will contain the trained model.
     * @param  string $importdir The directory that contains the files to import.
     * @return bool Success
     */
    public function import(string $uniqueid, string $modeldir, string $importdir) : bool {

        $importmodelfilepath = $this->get_model_filepath($importdir);
        $modelfilepath = $this->get_model_filepath($modeldir);

        // Nothing to import; bail out before file_get_contents() raises a warning.
        if (!is_readable($importmodelfilepath)) {
            return false;
        }

        $modelmanager = new ModelManager();

        // Copied from ModelManager::restoreFromFile to validate the serialised contents
        // before restoring them.
        $importconfig = file_get_contents($importmodelfilepath);
        if ($importconfig === false) {
            return false;
        }

        // Clean stuff like function calls.
        $importconfig = preg_replace('/[^a-zA-Z0-9\{\}%\.\*\;\,\:\"\-\0\\\]/', '', $importconfig);

        $object = unserialize($importconfig,
            ['allowed_classes' => ['Phpml\\Classification\\Linear\\LogisticRegression']]);
        if (!$object) {
            return false;
        }

        if (get_class($object) == '__PHP_Incomplete_Class') {
            return false;
        }

        // NOTE(review): the checks above run on a sanitised copy of the contents, whereas the classifier
        // below is restored by ModelManager from the original file.
        $classifier = $modelmanager->restoreFromFile($importmodelfilepath);

        // This would override any previous classifier.
        $modelmanager->saveToFile($classifier, $modelfilepath);

        return true;
    }

    /**
     * Returns the path to the serialised model file in the provided directory.
     *
     * @param  string $modeldir The model directory
     * @return string           The model file
     */
    protected function get_model_filepath(string $modeldir) : string {
        // Output directory is already unique to the model.
        return $modeldir . DIRECTORY_SEPARATOR . self::MODEL_FILENAME;
    }

    /**
     * Extracts metadata from the dataset file.
     *
     * Reads two CSV rows: the first one provides the metadata names and the second one their values.
     * The file pointer should be located at the top of the file.
     *
     * @param resource $fh
     * @return array Metadata values indexed by metadata name.
     */
    protected function extract_metadata($fh) {
        $metadata = fgetcsv($fh);
        return array_combine($metadata, fgetcsv($fh));
    }

    /**
     * Instantiates the ML algorithm.
     *
     * @return \Phpml\Classification\Linear\LogisticRegression
     */
    protected function instantiate_algorithm(): \Phpml\Classification\Linear\LogisticRegression {
        return new LogisticRegression(self::TRAIN_ITERATIONS, true,
            LogisticRegression::CONJUGATE_GRAD_TRAINING, 'log');
    }
}