1<?php
2// This file is part of Moodle - http://moodle.org/
3//
4// Moodle is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// Moodle is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with Moodle.  If not, see <http://www.gnu.org/licenses/>.
16
17/**
18 * Datasets manager.
19 *
20 * @package   core_analytics
21 * @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
22 * @license   http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
23 */
24
25namespace core_analytics;
26
27defined('MOODLE_INTERNAL') || die();
28
29/**
30 * Datasets manager.
31 *
32 * @package   core_analytics
33 * @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
34 * @license   http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
35 */
class dataset_manager {

    /**
     * File area for labelled datasets.
     */
    const LABELLED_FILEAREA = 'labelled';

    /**
     * File area for unlabelled datasets.
     */
    const UNLABELLED_FILEAREA = 'unlabelled';

    /**
     * File area for exported datasets.
     */
    const EXPORT_FILEAREA = 'export';

    /**
     * Name given to evaluation dataset files.
     */
    const EVALUATION_FILENAME = 'evaluation.csv';

    /**
     * The model id.
     *
     * @var int
     */
    protected $modelid;

    /**
     * Range processor in use.
     *
     * @var string
     */
    protected $timesplittingid;

    /**
     * The id of the analysable this dataset belongs to.
     *
     * @var int
     */
    protected $analysableid;

    /**
     * Whether this is a dataset for evaluation or not.
     *
     * @var bool
     */
    protected $evaluation;

    /**
     * The dataset filearea. Must be one of the self::*_FILEAREA options.
     *
     * @var string
     */
    protected $filearea;

    /**
     * Constructor method.
     *
     * @throws \coding_exception If $filearea is not one of the self::*_FILEAREA constants.
     * @param int $modelid
     * @param int $analysableid
     * @param string $timesplittingid
     * @param string $filearea One of the self::*_FILEAREA constants.
     * @param bool $evaluation
     * @return void
     */
    public function __construct($modelid, $analysableid, $timesplittingid, $filearea, $evaluation = false) {

        $validfileareas = [self::EXPORT_FILEAREA, self::LABELLED_FILEAREA, self::UNLABELLED_FILEAREA];
        if (!in_array($filearea, $validfileareas, true)) {
            throw new \coding_exception('Invalid provided filearea');
        }

        $this->modelid = $modelid;
        $this->analysableid = $analysableid;
        $this->timesplittingid = $timesplittingid;
        $this->filearea = $filearea;
        $this->evaluation = $evaluation;
    }

    /**
     * Store the dataset in the internal file system.
     *
     * The rows are first written as CSV to a temporary file in the request directory and
     * then imported into the file storage under /analysable/{analysableid}/{timesplittingid}/.
     *
     * @param array $data List of rows, each row being an array of values.
     * @return \stored_file|false The stored file, or false if the temporary file could not be opened.
     */
    public function store($data) {

        $fs = get_file_storage();

        $filerecord = [
            'component' => 'analytics',
            'filearea' => $this->filearea,
            'itemid' => $this->modelid,
            'contextid' => \context_system::instance()->id,
            'filepath' => '/analysable/' . $this->analysableid . '/' .
                \core_analytics\analysis::clean_time_splitting_id($this->timesplittingid) . '/',
            'filename' => self::get_filename($this->evaluation)
        ];

        // Delete previous and old (we already checked that previous copies are not recent) evaluation files for this analysable.
        if ($this->evaluation) {
            // The fragment is appended right after "itemid" by delete_area_files_select(), hence the leading operator.
            // The item id is bound as a named parameter rather than being interpolated into the SQL.
            $select = ' = :itemid AND filepath = :filepath';
            $params = [
                'itemid' => $filerecord['itemid'],
                'filepath' => $filerecord['filepath'],
            ];
            $fs->delete_area_files_select($filerecord['contextid'], $filerecord['component'], $filerecord['filearea'],
                $select, $params);
        }

        // Write all this stuff to a tmp file.
        $filepath = make_request_directory() . DIRECTORY_SEPARATOR . $filerecord['filename'];
        $fh = fopen($filepath, 'w+');
        if (!$fh) {
            return false;
        }
        foreach ($data as $line) {
            fputcsv($fh, $line);
        }
        fclose($fh);

        return $fs->create_file_from_pathname($filerecord, $filepath);
    }

    /**
     * Returns the previous evaluation file.
     *
     * Important to note that this is per modelid + timesplittingid, when dealing with multiple
     * analysables this is the merged file. Do not confuse with self::get_evaluation_analysable_file
     *
     * @param int $modelid
     * @param string $timesplittingid
     * @return \stored_file|false The file, or false if there is no previous evaluation file.
     */
    public static function get_previous_evaluation_file($modelid, $timesplittingid) {
        $fs = get_file_storage();

        // Evaluation data is always labelled.
        $filepath = '/timesplitting/' . \core_analytics\analysis::clean_time_splitting_id($timesplittingid) . '/';

        return $fs->get_file(\context_system::instance()->id, 'analytics', self::LABELLED_FILEAREA, $modelid,
            $filepath, self::EVALUATION_FILENAME);
    }

    /**
     * Gets the list of files that couldn't be previously used for training and prediction.
     *
     * Labelled files are checked against the 'trained' action and unlabelled files against
     * the 'predicted' action recorded in analytics_used_files.
     *
     * @param int $modelid
     * @param bool $includetarget True to look for labelled (training) files, false for unlabelled (prediction) ones.
     * @param string[] $timesplittingids
     * @return array Lists of \stored_file indexed by time splitting id; ids without pending files are not included.
     */
    public static function get_pending_files($modelid, $includetarget, $timesplittingids) {
        global $DB;

        $fs = get_file_storage();

        if ($includetarget) {
            $filearea = self::LABELLED_FILEAREA;
            $usedfileaction = 'trained';
        } else {
            $filearea = self::UNLABELLED_FILEAREA;
            $usedfileaction = 'predicted';
        }

        $select = 'modelid = :modelid AND action = :action';
        $params = array('modelid' => $modelid, 'action' => $usedfileaction);
        $usedfileids = $DB->get_fieldset_select('analytics_used_files', 'fileid', $select, $params);

        // Index the used file ids by id so that each file is checked with a hash lookup instead of a list scan.
        $usedfileids = array_flip($usedfileids);

        // The context does not change between iterations.
        $contextid = \context_system::instance()->id;

        // Very likely that we will only have 1 time splitting method here.
        $filesbytimesplitting = array();
        foreach ($timesplittingids as $timesplittingid) {

            $filepath = '/timesplitting/' . \core_analytics\analysis::clean_time_splitting_id($timesplittingid) . '/';
            $files = $fs->get_directory_files($contextid, 'analytics', $filearea, $modelid, $filepath);
            foreach ($files as $file) {

                // Discard evaluation files.
                if ($file->get_filename() === self::EVALUATION_FILENAME) {
                    continue;
                }

                // No dirs.
                if ($file->is_directory()) {
                    continue;
                }

                // Already used for training / prediction.
                if (isset($usedfileids[$file->get_id()])) {
                    continue;
                }

                $filesbytimesplitting[$timesplittingid][] = $file;
            }
        }

        return $filesbytimesplitting;
    }

    /**
     * Deletes previous evaluation files of this model.
     *
     * @param int $modelid
     * @param string $timesplittingid
     * @return bool True if a file was found and deleted, false if there was nothing to delete.
     */
    public static function delete_previous_evaluation_file($modelid, $timesplittingid) {
        $file = self::get_previous_evaluation_file($modelid, $timesplittingid);
        if (!$file) {
            return false;
        }

        $file->delete();
        return true;
    }

    /**
     * Returns this (model + analysable + time splitting) file.
     *
     * @param int $modelid
     * @param int $analysableid
     * @param string $timesplittingid
     * @return \stored_file|false The file, or false if it does not exist.
     */
    public static function get_evaluation_analysable_file($modelid, $analysableid, $timesplittingid) {

        $fs = get_file_storage();

        // Always evaluation.csv and labelled as it is an evaluation file.
        $filearea = self::LABELLED_FILEAREA;
        $filename = self::get_filename(true);
        $filepath = '/analysable/' . $analysableid . '/' .
            \core_analytics\analysis::clean_time_splitting_id($timesplittingid) . '/';

        return $fs->get_file(\context_system::instance()->id, 'analytics', $filearea, $modelid, $filepath, $filename);
    }

    /**
     * Merge multiple files into one.
     *
     * Each dataset is expected to start with three header lines: the var names, the analysable
     * values and the column names. The merged file gets the var names and the column names of
     * the last file, and the distinct analysable values of all files joined with '|'.
     *
     * Important! It is the caller responsibility to ensure that the datasets are compatible.
     *
     * @throws \coding_exception If no files are provided.
     * @throws \moodle_exception If the temporary merge file can not be written.
     * @param array  $files List of \stored_file to merge.
     * @param int    $modelid
     * @param string $timesplittingid
     * @param string $filearea
     * @param bool   $evaluation
     * @return \stored_file
     */
    public static function merge_datasets(array $files, $modelid, $timesplittingid, $filearea, $evaluation = false) {

        // Without files there are no headers to write.
        if (empty($files)) {
            throw new \coding_exception('No dataset files provided to merge');
        }

        $tmpfilepath = make_request_directory() . DIRECTORY_SEPARATOR . 'tmpfile.csv';

        // Add headers.
        // We could also do this with a single iteration gathering all files headers and appending them to the beginning of the file
        // once all file contents are merged.
        $varnames = array();
        $columns = array();
        $analysablesvalues = array();
        foreach ($files as $file) {
            $rh = $file->get_content_file_handle();

            // Copy the var names as they are, all files should have the same var names.
            $varnames = fgetcsv($rh);

            $analysablesvalues[] = fgetcsv($rh);

            // Copy the columns as they are, all files should have the same columns.
            $columns = fgetcsv($rh);

            // Only the headers are needed in this pass, release the handle straight away.
            fclose($rh);
        }

        // Merge analysable values skipping the ones that are the same in all analysables.
        $values = array();
        foreach ($analysablesvalues as $analysablevalues) {
            foreach ($analysablevalues as $varkey => $value) {
                // Sha1 to make it unique.
                $values[$varkey][sha1($value)] = $value;
            }
        }
        foreach ($values as $varkey => $varvalues) {
            $values[$varkey] = implode('|', $varvalues);
        }

        // Start writing to the merge file.
        $wh = fopen($tmpfilepath, 'w');
        if (!$wh) {
            throw new \moodle_exception('errorcannotwritedataset', 'analytics', '', $tmpfilepath);
        }

        fputcsv($wh, $varnames);
        fputcsv($wh, $values);
        fputcsv($wh, $columns);

        // Iterate through all files and add them to the tmp one. We don't want file contents in memory.
        foreach ($files as $file) {
            $rh = $file->get_content_file_handle();

            // Skip headers.
            fgets($rh);
            fgets($rh);
            fgets($rh);

            // Copy all the following lines. Compare against false so a falsy line (e.g. "0") does not end the copy.
            while (($line = fgets($rh)) !== false) {
                fwrite($wh, $line);
            }
            fclose($rh);
        }
        fclose($wh);

        $filerecord = [
            'component' => 'analytics',
            'filearea' => $filearea,
            'itemid' => $modelid,
            'contextid' => \context_system::instance()->id,
            'filepath' => '/timesplitting/' . \core_analytics\analysis::clean_time_splitting_id($timesplittingid) . '/',
            'filename' => self::get_filename($evaluation)
        ];

        $fs = get_file_storage();

        return $fs->create_file_from_pathname($filerecord, $tmpfilepath);
    }

    /**
     * Exports the model training data.
     *
     * All the labelled files of the time splitting method, evaluation files aside, are
     * merged into a single file stored in the export file area.
     *
     * @param int $modelid
     * @param string $timesplittingid
     * @return \stored_file|false The merged file, or false if there is no training data to export.
     */
    public static function export_training_data($modelid, $timesplittingid) {

        $fs = get_file_storage();

        $contextid = \context_system::instance()->id;
        $filepath = '/timesplitting/' . \core_analytics\analysis::clean_time_splitting_id($timesplittingid) . '/';

        $files = $fs->get_directory_files($contextid, 'analytics', self::LABELLED_FILEAREA, $modelid,
            $filepath, true, false);

        // Discard evaluation files.
        $files = array_filter($files, function($file) {
            return $file->get_filename() !== self::EVALUATION_FILENAME;
        });

        if (empty($files)) {
            return false;
        }

        return self::merge_datasets($files, $modelid, $timesplittingid, self::EXPORT_FILEAREA);
    }

    /**
     * Returns the dataset file data structured by sampleids using the indicators and target column names.
     *
     * The first two lines of the file (dataset info) are skipped and the third one provides
     * the column names. Empty values are returned as null and numeric values as floats.
     *
     * @throws \coding_exception If the dataset is not in the unlabelled file area.
     * @param \stored_file $dataset
     * @return array Column name => value arrays indexed by the first column of each row (the sample id).
     */
    public static function get_structured_data(\stored_file $dataset) {

        if ($dataset->get_filearea() !== self::UNLABELLED_FILEAREA) {
            throw new \coding_exception('Sorry, only support for unlabelled data');
        }

        $rh = $dataset->get_content_file_handle();

        try {
            // Skip dataset info.
            fgets($rh);
            fgets($rh);

            $calculations = array();

            $headers = fgetcsv($rh);
            // Get rid of the sampleid column name.
            array_shift($headers);

            while ($columns = fgetcsv($rh)) {

                // A blank line is returned by fgetcsv as an array with a single null, there is no sample in it.
                if ($columns === array(null)) {
                    continue;
                }

                $uniquesampleid = array_shift($columns);

                // Unfortunately fgetcsv does not respect line's var types.
                $calculations[$uniquesampleid] = array_map(function($value) {

                    if ($value === '') {
                        // We really want them as null because converted to float become 0
                        // and we need to treat the values separately.
                        return null;
                    } else if (is_numeric($value)) {
                        return floatval($value);
                    }
                    return $value;
                }, array_combine($headers, $columns));
            }
        } finally {
            // Always release the handle, even if the file contents are malformed.
            fclose($rh);
        }

        return $calculations;
    }

    /**
     * Delete all files of a model.
     *
     * Files are deleted from all the analytics file areas of the system context.
     *
     * @param int $modelid
     * @return bool
     */
    public static function clear_model_files($modelid) {
        $fs = get_file_storage();
        return $fs->delete_area_files(\context_system::instance()->id, 'analytics', false, $modelid);
    }

    /**
     * Returns the file name to be used.
     *
     * @param bool $evaluation Strictly true to get the evaluation file name.
     * @return string
     */
    protected static function get_filename($evaluation) {

        if ($evaluation === true) {
            return self::EVALUATION_FILENAME;
        }

        // Incremental time, the lock will make sure we don't have concurrency problems.
        return microtime(true) . '.csv';
    }
}
458