<?php
// This file is part of Moodle - http://moodle.org/
//
// Moodle is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Moodle is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Moodle.  If not, see <http://www.gnu.org/licenses/>.

/**
 * PHP predictions processor.
 *
 * @package   mlbackend_php
 * @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
 * @license   http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
 */

namespace mlbackend_php;

defined('MOODLE_INTERNAL') || die();

use Phpml\Preprocessing\Normalizer;
use Phpml\CrossValidation\RandomSplit;
use Phpml\Dataset\ArrayDataset;
use Phpml\ModelManager;
use Phpml\Classification\Linear\LogisticRegression;
use Phpml\Metric\ClassificationReport;

/**
 * PHP predictions processor.
 *
 * @package   mlbackend_php
 * @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
 * @license   http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
 */
class processor implements \core_analytics\classifier, \core_analytics\regressor, \core_analytics\packable {

    /**
     * Size of training / prediction batches.
     */
    const BATCH_SIZE = 5000;

    /**
     * Number of train iterations.
     */
    const TRAIN_ITERATIONS = 500;

    /**
     * File name of the serialised model.
     */
    const MODEL_FILENAME = 'model.ser';

    /**
     * @var bool Whether the size of the evaluation dataset had to be limited to fit into memory.
     */
    protected $limitedsize = false;

    /**
     * Checks if the processor is ready to use.
     *
     * @return bool|string True if the processor is ready to use, an error string otherwise.
     */
    public function is_ready() {
        if (version_compare(phpversion(), '7.0.0') < 0) {
            return get_string('errorphp7required', 'mlbackend_php');
        }
        return true;
    }

    /**
     * Delete the stored models.
     *
     * @param string $uniqueid
     * @param string $modelversionoutputdir
     * @return null
     */
    public function clear_model($uniqueid, $modelversionoutputdir) {
        remove_dir($modelversionoutputdir);
    }

    /**
     * Delete the output directory.
     *
     * @param string $modeloutputdir
     * @param string $uniqueid
     * @return null
     */
    public function delete_output_dir($modeloutputdir, $uniqueid) {
        remove_dir($modeloutputdir);
    }

    /**
     * Train this processor classification model using the provided supervised learning dataset.
     *
     * @param string $uniqueid
     * @param \stored_file $dataset
     * @param string $outputdir
     * @return \stdClass
     */
    public function train_classification($uniqueid, \stored_file $dataset, $outputdir) {

        $modelfilepath = $this->get_model_filepath($outputdir);

        $modelmanager = new ModelManager();

        if (file_exists($modelfilepath)) {
            $classifier = $modelmanager->restoreFromFile($modelfilepath);
        } else {
            $classifier = $this->instantiate_algorithm();
        }

        $fh = $dataset->get_content_file_handle();

        // The first line contains the metadata names and the second one their values.
        $metadata = $this->extract_metadata($fh);

        // Skip headers.
        fgets($fh);

        $samples = array();
        $targets = array();
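        // Each CSV row holds the feature values in the first columns and the target class in the last one.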
        while (($data = fgetcsv($fh)) !== false) {
            $sampledata = array_map('floatval', $data);
            $samples[] = array_slice($sampledata, 0, $metadata['nfeatures']);
            $targets[] = intval($data[$metadata['nfeatures']]);

            $nsamples = count($samples);
            if ($nsamples === self::BATCH_SIZE) {
                // Train in batches to avoid running out of memory.
                $classifier->partialTrain($samples, $targets, json_decode($metadata['targetclasses']));
                $samples = array();
                $targets = array();
            }
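            // Remember that the dataset contains more than one sample; a single sample is not enough to train on.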
            if (empty($morethan1sample) && $nsamples > 1) {
                $morethan1sample = true;
            }
        }
        fclose($fh);

        if (empty($morethan1sample)) {
            $resultobj = new \stdClass();
            $resultobj->status = \core_analytics\model::NO_DATASET;
            $resultobj->info = array();
            return $resultobj;
        }

        // Train the remaining samples.
        if ($samples) {
            $classifier->partialTrain($samples, $targets, json_decode($metadata['targetclasses']));
        }

        $resultobj = new \stdClass();
        $resultobj->status = \core_analytics\model::OK;
        $resultobj->info = array();

        // Store the trained model.
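        // ModelManager serialises the classifier to disk; the next training run restores it and continues with partialTrain().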
        $modelmanager->saveToFile($classifier, $modelfilepath);

        return $resultobj;
    }

    /**
     * Classifies the provided dataset samples.
     *
     * @param string $uniqueid
     * @param \stored_file $dataset
     * @param string $outputdir
     * @return \stdClass
     */
    public function classify($uniqueid, \stored_file $dataset, $outputdir) {

        $classifier = $this->load_classifier($outputdir);

        $fh = $dataset->get_content_file_handle();

        // The first line contains the metadata names and the second one their values.
        $metadata = $this->extract_metadata($fh);

        // Skip headers.
        fgets($fh);

        $sampleids = array();
        $samples = array();
        $predictions = array();
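        // Each CSV row holds the sample id in the first column, followed by the feature values.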
        while (($data = fgetcsv($fh)) !== false) {
            $sampledata = array_map('floatval', $data);
            $sampleids[] = $data[0];
            $samples[] = array_slice($sampledata, 1, $metadata['nfeatures']);

            if (count($samples) === self::BATCH_SIZE) {
                // Predict in batches to avoid running out of memory.

                // Append predictions incrementally; we want the $sampleids keys to stay in sync with the $predictions keys.
                $newpredictions = $classifier->predict($samples);
                foreach ($newpredictions as $prediction) {
                    array_push($predictions, $prediction);
                }
                $samples = array();
            }
        }
        fclose($fh);

        // Finish the remaining predictions.
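        // array_merge() renumbers the appended keys, keeping $predictions in sync with $sampleids.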
        if ($samples) {
            $predictions = array_merge($predictions, $classifier->predict($samples));
        }

        $resultobj = new \stdClass();
        $resultobj->status = \core_analytics\model::OK;
        $resultobj->info = array();

        foreach ($predictions as $index => $prediction) {
            $resultobj->predictions[$index] = array($sampleids[$index], $prediction);
        }

        return $resultobj;
    }

    /**
     * Evaluates this processor classification model using the provided supervised learning dataset.
     *
     * During evaluation we need to shuffle the evaluation dataset samples to detect deviated results,
     * but if the dataset is massive we cannot load everything into memory. We know that 2GB is the
     * minimum memory limit we should have (\core_analytics\model::heavy_duty_mode); if we subtract the
     * memory that we have already consumed and the memory that the Phpml algorithms will need, we should
     * still have at least 500MB of memory, which should be enough to evaluate a model. This is not a
     * bulletproof solution that will work for every site, but it should minimise memory limit problems.
     * Site admins can still set $CFG->mlbackend_php_no_evaluation_limits to true to skip this 500MB limit.
     *
     * @param string $uniqueid
     * @param float $maxdeviation
     * @param int $niterations
     * @param \stored_file $dataset
     * @param string $outputdir
     * @param string $trainedmodeldir
     * @return \stdClass
     */
    public function evaluate_classification($uniqueid, $maxdeviation, $niterations, \stored_file $dataset,
            $outputdir, $trainedmodeldir) {
        global $CFG;

        $fh = $dataset->get_content_file_handle();

        if ($trainedmodeldir) {
            // We overwrite the number of iterations as the results will always be the same.
            $niterations = 1;
            $classifier = $this->load_classifier($trainedmodeldir);
        }

        // The first line contains the metadata names and the second one their values.
        $metadata = $this->extract_metadata($fh);

        // Skip headers.
        fgets($fh);

        if (empty($CFG->mlbackend_php_no_evaluation_limits)) {
            $samplessize = 0;
            $limit = get_real_size('500MB');

            // Just an approximation, will depend on PHP version, compile options...
            // Double size + zval struct (6 bytes + 8 bytes + 16 bytes) + array bucket (96 bytes)
            // https://nikic.github.io/2011/12/12/How-big-are-PHP-arrays-really-Hint-BIG.html.
            $floatsize = (PHP_INT_SIZE * 2) + 6 + 8 + 16 + 96;
        }

        $samples = array();
        $targets = array();
        while (($data = fgetcsv($fh)) !== false) {
            $sampledata = array_map('floatval', $data);

            $samples[] = array_slice($sampledata, 0, $metadata['nfeatures']);
            $targets[] = intval($data[$metadata['nfeatures']]);

            if (empty($CFG->mlbackend_php_no_evaluation_limits)) {
                // We allow admins to disable evaluation memory usage limits by modifying config.php.

                // We will have plenty of missing values in the dataset so it should be a conservative approximation.
                $samplessize = $samplessize + (count($sampledata) * $floatsize);

                // Stop fetching more samples.
                if ($samplessize >= $limit) {
                    $this->limitedsize = true;
                    break;
                }
            }
        }
        fclose($fh);

        // We need at least 2 samples belonging to each target.
        $counts = array_count_values($targets);
        $ntargets = count(explode(',', $metadata['targetclasses']));
        foreach ($counts as $count) {
            if ($count < 2) {
                $notenoughdata = true;
            }
        }
        if ($ntargets > count($counts)) {
            $notenoughdata = true;
        }
        if (!empty($notenoughdata)) {
            $resultobj = new \stdClass();
            $resultobj->status = \core_analytics\model::NOT_ENOUGH_DATA;
            $resultobj->score = 0;
            $resultobj->info = array(get_string('errornotenoughdata', 'mlbackend_php'));
            return $resultobj;
        }

        $scores = array();

        // Evaluate the model multiple times to confirm that the results are not skewed by randomness
        // due to a small amount of data.
        for ($i = 0; $i < $niterations; $i++) {

            if (!$trainedmodeldir) {
                $classifier = $this->instantiate_algorithm();

                // Split up the dataset into training and testing.
                $data = new RandomSplit(new ArrayDataset($samples, $targets), 0.2);
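                // RandomSplit's second argument is the test-set ratio: 20% of the samples are held out
                // for testing and the remaining 80% are used for training.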

                $classifier->train($data->getTrainSamples(), $data->getTrainLabels());
                $predictedlabels = $classifier->predict($data->getTestSamples());
                $report = new ClassificationReport($data->getTestLabels(), $predictedlabels,
                    ClassificationReport::WEIGHTED_AVERAGE);
            } else {
                $predictedlabels = $classifier->predict($samples);
                $report = new ClassificationReport($targets, $predictedlabels,
                    ClassificationReport::WEIGHTED_AVERAGE);
            }
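            // Use the weighted-average F1 score across all target classes as the score for this iteration.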
            $averages = $report->getAverage();
            $scores[] = $averages['f1score'];
        }

        // Let's fill the results, adjusting the returned status code depending on the calculated score metrics.
        return $this->get_evaluation_result_object($dataset, $scores, $maxdeviation);
    }

    /**
     * Returns the result object built from all evaluation iterations.
     *
     * @param \stored_file $dataset
     * @param array $scores
     * @param float $maxdeviation
     * @return \stdClass
     */
    protected function get_evaluation_result_object(\stored_file $dataset, $scores, $maxdeviation) {

        // Average f1 score of all evaluations as final score.
        if (count($scores) === 1) {
            $avgscore = reset($scores);
        } else {
            $avgscore = \Phpml\Math\Statistic\Mean::arithmetic($scores);
        }

        // Standard deviation should ideally be calculated against the area under the curve.
        if (count($scores) === 1) {
            $modeldev = 0;
        } else {
            $modeldev = \Phpml\Math\Statistic\StandardDeviation::population($scores);
        }

        // Let's fill the results object.
        $resultobj = new \stdClass();

        // Zero is ok, now we add other bits if something is not right.
        $resultobj->status = \core_analytics\model::OK;
        $resultobj->info = array();
        $resultobj->score = $avgscore;

        // If the results of each iteration varied too much we need more data to confirm that this is a valid model.
        if ($modeldev > $maxdeviation) {
            $resultobj->status = $resultobj->status + \core_analytics\model::NOT_ENOUGH_DATA;
            $a = new \stdClass();
            $a->deviation = $modeldev;
            $a->accepteddeviation = $maxdeviation;
            $resultobj->info[] = get_string('errornotenoughdatadev', 'mlbackend_php', $a);
        }

        if ($resultobj->score < \core_analytics\model::MIN_SCORE) {
            $resultobj->status = $resultobj->status + \core_analytics\model::LOW_SCORE;
            $a = new \stdClass();
            $a->score = $resultobj->score;
            $a->minscore = \core_analytics\model::MIN_SCORE;
            $resultobj->info[] = get_string('errorlowscore', 'mlbackend_php', $a);
        }

        if ($this->limitedsize === true) {
            $resultobj->info[] = get_string('datasetsizelimited', 'mlbackend_php', display_size($dataset->get_filesize()));
        }

        return $resultobj;
    }

    /**
     * Loads the pre-trained classifier.
     *
     * @throws \moodle_exception
     * @param string $outputdir
     * @return \Phpml\Classification\Linear\LogisticRegression
     */
    protected function load_classifier($outputdir) {
        $modelfilepath = $this->get_model_filepath($outputdir);

        if (!file_exists($modelfilepath)) {
            throw new \moodle_exception('errorcantloadmodel', 'mlbackend_php', '', $modelfilepath);
        }

        $modelmanager = new ModelManager();
        return $modelmanager->restoreFromFile($modelfilepath);
    }

    /**
     * Train this processor regression model using the provided supervised learning dataset.
     *
     * @throws \coding_exception
     * @param string $uniqueid
     * @param \stored_file $dataset
     * @param string $outputdir
     * @return \stdClass
     */
    public function train_regression($uniqueid, \stored_file $dataset, $outputdir) {
        throw new \coding_exception('This predictor does not support regression yet.');
    }

    /**
     * Estimates linear values for the provided dataset samples.
     *
     * @throws \coding_exception
     * @param string $uniqueid
     * @param \stored_file $dataset
     * @param string $outputdir
     * @return void
     */
    public function estimate($uniqueid, \stored_file $dataset, $outputdir) {
        throw new \coding_exception('This predictor does not support regression yet.');
    }

    /**
     * Evaluates this processor regression model using the provided supervised learning dataset.
     *
     * @throws \coding_exception
     * @param string $uniqueid
     * @param float $maxdeviation
     * @param int $niterations
     * @param \stored_file $dataset
     * @param string $outputdir
     * @param string $trainedmodeldir
     * @return \stdClass
     */
    public function evaluate_regression($uniqueid, $maxdeviation, $niterations, \stored_file $dataset,
            $outputdir, $trainedmodeldir) {
        throw new \coding_exception('This predictor does not support regression yet.');
    }

    /**
     * Exports the machine learning model.
     *
     * @throws \moodle_exception
     * @param  string $uniqueid  The model unique id
     * @param  string $modeldir  The directory that contains the trained model.
     * @return string            The path to the directory that contains the exported model.
     */
    public function export(string $uniqueid, string $modeldir) : string {

        $modelfilepath = $this->get_model_filepath($modeldir);

        if (!file_exists($modelfilepath)) {
            throw new \moodle_exception('errorexportmodelresult', 'analytics');
        }

        // We can use the actual $modeldir as the directory is not modified during export, just copied into a zip.
        return $modeldir;
    }

    /**
     * Imports the provided machine learning model.
     *
     * @param  string $uniqueid The model unique id
     * @param  string $modeldir  The directory that will contain the trained model.
     * @param  string $importdir The directory that contains the files to import.
     * @return bool Success
     */
    public function import(string $uniqueid, string $modeldir, string $importdir) : bool {

        $importmodelfilepath = $this->get_model_filepath($importdir);
        $modelfilepath = $this->get_model_filepath($modeldir);

        $modelmanager = new ModelManager();

        // Copied from ModelManager::restoreFromFile to validate the serialised contents
        // before restoring them.
        $importconfig = file_get_contents($importmodelfilepath);

        // Clean stuff like function calls.
        $importconfig = preg_replace('/[^a-zA-Z0-9\{\}%\.\*\;\,\:\"\-\0\\\]/', '', $importconfig);

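        // Restricting allowed_classes means that any class other than the expected LogisticRegression
        // model is unserialised as __PHP_Incomplete_Class and rejected below.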
        $object = unserialize($importconfig,
            ['allowed_classes' => ['Phpml\\Classification\\Linear\\LogisticRegression']]);
        if (!$object) {
            return false;
        }

        if (get_class($object) == '__PHP_Incomplete_Class') {
            return false;
        }

        $classifier = $modelmanager->restoreFromFile($importmodelfilepath);

        // This overwrites any previously stored classifier.
        $modelmanager->saveToFile($classifier, $modelfilepath);

        return true;
    }

    /**
     * Returns the path to the serialised model file in the provided directory.
     *
     * @param  string $modeldir The model directory
     * @return string           The model file
     */
    protected function get_model_filepath(string $modeldir) : string {
        // Output directory is already unique to the model.
        return $modeldir . DIRECTORY_SEPARATOR . self::MODEL_FILENAME;
    }

    /**
     * Extracts metadata from the dataset file.
     *
     * The file pointer should be located at the top of the file.
     *
     * @param resource $fh
     * @return array
     */
    protected function extract_metadata($fh) {
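        // The first CSV line lists the metadata names (e.g. nfeatures, targetclasses) and the second one
        // their values; combine them into an associative array.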
        $metadata = fgetcsv($fh);
        return array_combine($metadata, fgetcsv($fh));
    }

    /**
     * Instantiates the ML algorithm.
     *
     * @return \Phpml\Classification\Linear\LogisticRegression
     */
    protected function instantiate_algorithm(): \Phpml\Classification\Linear\LogisticRegression {
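        // Logistic regression with TRAIN_ITERATIONS maximum iterations, input normalisation enabled,
        // conjugate gradient training and the 'log' (cross-entropy) cost function.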
        return new LogisticRegression(self::TRAIN_ITERATIONS, true,
            LogisticRegression::CONJUGATE_GRAD_TRAINING, 'log');
    }
}