1<?php
2// This file is part of Moodle - http://moodle.org/
3//
4// Moodle is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// Moodle is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with Moodle.  If not, see <http://www.gnu.org/licenses/>.
16
17/**
18 * Runs an analysis of the site.
19 *
20 * @package   core_analytics
21 * @copyright 2019 David Monllao {@link http://www.davidmonllao.com}
22 * @license   http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
23 */
24
25namespace core_analytics;
26
27defined('MOODLE_INTERNAL') || die();
28
29/**
30 * Runs an analysis of the site.
31 *
32 * @package   core_analytics
33 * @copyright 2019 David Monllao {@link http://www.davidmonllao.com}
34 * @license   http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
35 */
36class analysis {
37
38    /**
39     * @var \core_analytics\local\analyser\base
40     */
41    private $analyser;
42
43    /**
44     * @var bool Whether to calculate the target or not in this run.
45     */
46    private $includetarget;
47
48    /**
49     * @var \core_analytics\local\analysis\result
50     */
51    private $result;
52
53    /**
54     * @var \core\lock\lock
55     */
56    private $lock;
57
58    /**
59     * Constructor.
60     *
61     * @param \core_analytics\local\analyser\base   $analyser
62     * @param bool                                  $includetarget Whether to calculate the target or not.
63     * @param \core_analytics\local\analysis\result $result
64     */
65    public function __construct(\core_analytics\local\analyser\base $analyser, bool $includetarget,
66            \core_analytics\local\analysis\result $result) {
67        $this->analyser = $analyser;
68        $this->includetarget = $includetarget;
69        $this->result = $result;
70
71        // We cache the first time analysables were analysed because time-splitting methods can depend on these info.
72        self::fill_firstanalyses_cache($this->analyser->get_modelid());
73    }
74
75    /**
76     * Runs the analysis.
77     *
     * @param \context[] $contexts Restrict the analysis to these contexts. No context restrictions if empty (the default).
79     * @return null
80     */
81    public function run(array $contexts = []) {
82
83        $options = $this->analyser->get_options();
84
85        // Time limit control.
86        $modeltimelimit = intval(get_config('analytics', 'modeltimelimit'));
87
88        if ($this->includetarget) {
89            $action = 'training';
90        } else {
91            $action = 'prediction';
92        }
93        $analysables = $this->analyser->get_analysables_iterator($action, $contexts);
94
95        $processedanalysables = $this->get_processed_analysables();
96
97        $inittime = microtime(true);
98        foreach ($analysables as $analysable) {
99            $processed = false;
100
101            if (!$analysable) {
102                continue;
103            }
104
105            $analysableresults = $this->process_analysable($analysable);
106            if ($analysableresults) {
107                $processed = $this->result->add_analysable_results($analysableresults);
108                if (!$processed) {
109                    $errors = array();
110                    foreach ($analysableresults as $timesplittingid => $result) {
111                        $str = '';
112                        if (count($analysableresults) > 1) {
113                            $str .= $timesplittingid . ': ';
114                        }
115                        $str .= $result->message;
116                        $errors[] = $str;
117                    }
118
119                    $a = new \stdClass();
120                    $a->analysableid = $analysable->get_name();
121                    $a->errors = implode(', ', $errors);
122                    $this->analyser->add_log(get_string('analysablenotused', 'analytics', $a));
123                }
124            }
125
126            if (!$options['evaluation']) {
127
128                if (empty($processedanalysables[$analysable->get_id()]) ||
129                        $this->analyser->get_target()->always_update_analysis_time() || $processed) {
130                    // We store the list of processed analysables even if the target does not always_update_analysis_time(),
131                    // what always_update_analysis_time controls is the update of the data.
132                    $this->update_analysable_analysed_time($processedanalysables, $analysable->get_id());
133                }
134
135                // Apply time limit.
136                $timespent = microtime(true) - $inittime;
137                if ($modeltimelimit <= $timespent) {
138                    break;
139                }
140            }
141        }
142
143        // Force GC to clean up the indicator instances used during the last iteration.
144        $this->analyser->instantiate_indicators();
145    }
146
147    /**
148     * Get analysables that have been already processed.
149     *
150     * @return \stdClass[]
151     */
152    protected function get_processed_analysables(): array {
153        global $DB;
154
155        $params = array('modelid' => $this->analyser->get_modelid());
156        $params['action'] = ($this->includetarget) ? 'training' : 'prediction';
157        $select = 'modelid = :modelid and action = :action';
158
159        // Weird select fields ordering for performance (analysableid key matching, analysableid is also unique by modelid).
160        return $DB->get_records_select('analytics_used_analysables', $select,
161            $params, 'timeanalysed DESC', 'analysableid, modelid, action, firstanalysis, timeanalysed, id AS primarykey');
162    }
163
164    /**
165     * Processes an analysable
166     *
     * This method returns one result object per time splitting method, indexed by the time splitting
     * method id, or an empty array if the analysable is not valid for the target.
169     *
170     * @param \core_analytics\analysable $analysable
171     * @return \stdClass[] Results objects by time splitting method
172     */
173    public function process_analysable(\core_analytics\analysable $analysable): array {
174
175        // Target instances scope is per-analysable (it can't be lower as calculations run once per
176        // analysable, not time splitting method nor time range).
177        $target = call_user_func(array($this->analyser->get_target(), 'instance'));
178
179        // We need to check that the analysable is valid for the target even if we don't include targets
180        // as we still need to discard invalid analysables for the target.
181        $isvalidresult = $target->is_valid_analysable($analysable, $this->includetarget);
182        if ($isvalidresult !== true) {
183            $a = new \stdClass();
184            $a->analysableid = $analysable->get_name();
185            $a->result = $isvalidresult;
186            $this->analyser->add_log(get_string('analysablenotvalidfortarget', 'analytics', $a));
187            return array();
188        }
189
190        // Process all provided time splitting methods.
191        $results = array();
192        foreach ($this->analyser->get_timesplittings() as $timesplitting) {
193
194            $cachedresult = $this->result->retrieve_cached_result($timesplitting, $analysable);
195            if ($cachedresult) {
196                $result = new \stdClass();
197                $result->result = $cachedresult;
198                $results[$timesplitting->get_id()] = $result;
199                continue;
200            }
201
202            $results[$timesplitting->get_id()] = $this->process_time_splitting($timesplitting, $analysable, $target);
203        }
204
205        return $results;
206    }
207
208    /**
209     * Processes the analysable samples using the provided time splitting method.
210     *
211     * @param \core_analytics\local\time_splitting\base $timesplitting
212     * @param \core_analytics\analysable $analysable
213     * @param \core_analytics\local\target\base $target
214     * @return \stdClass Results object.
215     */
216    protected function process_time_splitting(\core_analytics\local\time_splitting\base $timesplitting,
217            \core_analytics\analysable $analysable, \core_analytics\local\target\base $target): \stdClass {
218
219        $options = $this->analyser->get_options();
220
221        $result = new \stdClass();
222
223        $timesplitting->set_modelid($this->analyser->get_modelid());
224        if (!$timesplitting->is_valid_analysable($analysable)) {
225            $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
226            $result->message = get_string('invalidanalysablefortimesplitting', 'analytics',
227                $timesplitting->get_name());
228            return $result;
229        }
230        $timesplitting->set_analysable($analysable);
231
232        if (CLI_SCRIPT && !PHPUNIT_TEST) {
233            mtrace('Analysing id "' . $analysable->get_id() . '" with "' . $timesplitting->get_name() .
234                '" time splitting method...');
235        }
236
237        // What is a sample is defined by the analyser, it can be an enrolment, a course, a user, a question
238        // attempt... it is on what we will base indicators calculations.
239        list($sampleids, $samplesdata) = $this->analyser->get_all_samples($analysable);
240
241        if (count($sampleids) === 0) {
242            $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
243            $result->message = get_string('nodata', 'analytics');
244            return $result;
245        }
246
247        if ($this->includetarget) {
248            // All ranges are used when we are calculating data for training.
249            $ranges = $timesplitting->get_training_ranges();
250        } else {
251            // The latest range that has not yet been used for prediction (it depends on the time range where we are right now).
252            $ranges = $timesplitting->get_most_recent_prediction_range();
253        }
254
255        // There is no need to keep track of the evaluated samples and ranges as we always evaluate the whole dataset.
256        if ($options['evaluation'] === false) {
257
258            if (empty($ranges)) {
259                $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
260                $result->message = get_string('noranges', 'analytics');
261                return $result;
262            }
263
264            // We skip all samples that are already part of a training dataset, even if they have not been used for prediction.
265            if (!$target::based_on_assumptions()) {
266                // Targets based on assumptions can not be trained.
267                $this->filter_out_train_samples($sampleids, $timesplitting);
268            }
269
270            if (count($sampleids) === 0) {
271                $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
272                $result->message = get_string('nonewdata', 'analytics');
273                return $result;
274            }
275
276            // Only when processing data for predictions.
277            if (!$this->includetarget) {
278                // We also filter out samples and ranges that have already been used for predictions.
279                $predictsamplesrecord = $this->filter_out_prediction_samples_and_ranges($sampleids, $ranges, $timesplitting);
280            }
281
282            if (count($sampleids) === 0) {
283                $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
284                $result->message = get_string('nonewdata', 'analytics');
285                return $result;
286            }
287
288            if (count($ranges) === 0) {
289                $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
290                $result->message = get_string('nonewranges', 'analytics');
291                return $result;
292            }
293        }
294
295        // Flag the model + analysable + timesplitting as being analysed (prevent concurrent executions).
296        if (!$this->init_analysable_analysis($timesplitting->get_id(), $analysable->get_id())) {
297            // If this model + analysable + timesplitting combination is being analysed we skip this process.
298            $result->status = \core_analytics\model::NO_DATASET;
299            $result->message = get_string('analysisinprogress', 'analytics');
300            return $result;
301        }
302
303        // Remove samples the target consider invalid.
304        try {
305            $target->add_sample_data($samplesdata);
306            $target->filter_out_invalid_samples($sampleids, $analysable, $this->includetarget);
307        } catch (\Throwable $e) {
308            $this->finish_analysable_analysis();
309            throw $e;
310        }
311
312        if (!$sampleids) {
313            $result->status = \core_analytics\model::NO_DATASET;
314            $result->message = get_string('novalidsamples', 'analytics');
315            $this->finish_analysable_analysis();
316            return $result;
317        }
318
319        try {
320            // Instantiate empty indicators to ensure that no garbage is dragged from previous analyses.
321            $indicators = $this->analyser->instantiate_indicators();
322            foreach ($indicators as $key => $indicator) {
323                // The analyser attaches the main entities the sample depends on and are provided to the
324                // indicator to calculate the sample.
325                $indicators[$key]->add_sample_data($samplesdata);
326            }
327
328            // Here we start the memory intensive process that will last until $data var is
329            // unset (until the method is finished basically).
330            $data = $this->calculate($timesplitting, $sampleids, $ranges, $target);
331        } catch (\Throwable $e) {
332            $this->finish_analysable_analysis();
333            throw $e;
334        }
335
336        if (!$data) {
337            $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
338            $result->message = get_string('novaliddata', 'analytics');
339            $this->finish_analysable_analysis();
340            return $result;
341        }
342
343        try {
344            // No need to keep track of analysed stuff when evaluating.
345            if ($options['evaluation'] === false) {
346                // Save the samples that have been already analysed so they are not analysed again in future.
347
348                if ($this->includetarget) {
349                    $this->save_train_samples($sampleids, $timesplitting);
350                } else {
351                    // The variable $predictsamplesrecord will always be set as filter_out_prediction_samples_and_ranges
352                    // will always be called before it (no evaluation mode and no includetarget).
353                    $this->save_prediction_samples($sampleids, $ranges, $timesplitting, $predictsamplesrecord);
354                }
355            }
356
357            // We need to pass all the analysis data.
358            $formattedresult = $this->result->format_result($data, $target, $timesplitting, $analysable);
359
360        } catch (\Throwable $e) {
361            $this->finish_analysable_analysis();
362            throw $e;
363        }
364
365        if (!$formattedresult) {
366            $this->finish_analysable_analysis();
367            throw new \moodle_exception('errorcannotwritedataset', 'analytics');
368        }
369
370        $result->status = \core_analytics\model::OK;
371        $result->message = get_string('successfullyanalysed', 'analytics');
372        $result->result = $formattedresult;
373
374        // Flag the model + analysable + timesplitting as analysed.
375        $this->finish_analysable_analysis();
376
377        return $result;
378    }
379
380    /**
381     * Calculates indicators and targets.
382     *
383     * @param \core_analytics\local\time_splitting\base $timesplitting
384     * @param array $sampleids
385     * @param array $ranges
386     * @param \core_analytics\local\target\base $target
387     * @return array|null
388     */
389    public function calculate(\core_analytics\local\time_splitting\base $timesplitting, array &$sampleids,
390            array $ranges, \core_analytics\local\target\base $target): ?array {
391
392        $calculatedtarget = null;
393        if ($this->includetarget) {
394            // We first calculate the target because analysable data may still be invalid or none
395            // of the analysable samples may be valid.
396            $calculatedtarget = $target->calculate($sampleids, $timesplitting->get_analysable());
397
398            // We remove samples we can not calculate their target.
399            $sampleids = array_filter($sampleids, function($sampleid) use ($calculatedtarget) {
400                if (is_null($calculatedtarget[$sampleid])) {
401                    return false;
402                }
403                return true;
404            });
405        }
406
407        // No need to continue calculating if the target couldn't be calculated for any sample.
408        if (empty($sampleids)) {
409            return null;
410        }
411
412        $dataset = $this->calculate_indicators($timesplitting, $sampleids, $ranges);
413
414        if (empty($dataset)) {
415            return null;
416        }
417
418        // Now that we have the indicators in place we can add the time range indicators (and target if provided) to each of them.
419        $this->fill_dataset($timesplitting, $dataset, $calculatedtarget);
420
421        $this->add_context_metadata($timesplitting, $dataset, $target);
422
423        if (!PHPUNIT_TEST && CLI_SCRIPT) {
424            echo PHP_EOL;
425        }
426
427        return $dataset;
428    }
429
430    /**
431     * Calculates indicators.
432     *
433     * @param \core_analytics\local\time_splitting\base $timesplitting
434     * @param array $sampleids
435     * @param array $ranges
436     * @return array
437     */
438    protected function calculate_indicators(\core_analytics\local\time_splitting\base $timesplitting, array $sampleids,
439            array $ranges): array {
440        global $DB;
441
442        $options = $this->analyser->get_options();
443
444        $dataset = array();
445
446        // Faster to run 1 db query per range.
447        $existingcalculations = array();
448        if ($timesplitting->cache_indicator_calculations()) {
449            foreach ($ranges as $rangeindex => $range) {
450                // Load existing calculations.
451                $existingcalculations[$rangeindex] = \core_analytics\manager::get_indicator_calculations(
452                    $timesplitting->get_analysable(), $range['start'], $range['end'], $this->analyser->get_samples_origin());
453            }
454        }
455
456        // Here we store samples which calculations are not all null.
457        $notnulls = array();
458
459        // Fill the dataset samples with indicators data.
460        $newcalculations = array();
461        foreach ($this->analyser->get_indicators() as $indicator) {
462
463            // Hook to allow indicators to store analysable-dependant data.
464            $indicator->fill_per_analysable_caches($timesplitting->get_analysable());
465
466            // Per-range calculations.
467            foreach ($ranges as $rangeindex => $range) {
468
469                // Indicator instances are per-range.
470                $rangeindicator = clone $indicator;
471
472                $prevcalculations = array();
473                if (!empty($existingcalculations[$rangeindex][$rangeindicator->get_id()])) {
474                    $prevcalculations = $existingcalculations[$rangeindex][$rangeindicator->get_id()];
475                }
476
477                // Calculate the indicator for each sample in this time range.
478                list($samplesfeatures, $newindicatorcalculations, $indicatornotnulls) = $rangeindicator->calculate($sampleids,
479                    $this->analyser->get_samples_origin(), $range['start'], $range['end'], $prevcalculations);
480
481                // Associate the extra data generated by the indicator to this range index.
482                $rangeindicator->save_calculation_info($timesplitting, $rangeindex);
483
484                // Free memory ASAP.
485                unset($rangeindicator);
486                gc_collect_cycles();
487                gc_mem_caches();
488
489                // Copy the features data to the dataset.
490                foreach ($samplesfeatures as $analysersampleid => $features) {
491
492                    $uniquesampleid = $timesplitting->append_rangeindex($analysersampleid, $rangeindex);
493
494                    if (!isset($notnulls[$uniquesampleid]) && !empty($indicatornotnulls[$analysersampleid])) {
495                        $notnulls[$uniquesampleid] = $uniquesampleid;
496                    }
497
498                    // Init the sample if it is still empty.
499                    if (!isset($dataset[$uniquesampleid])) {
500                        $dataset[$uniquesampleid] = array();
501                    }
502
503                    // Append the features indicator features at the end of the sample.
504                    $dataset[$uniquesampleid] = array_merge($dataset[$uniquesampleid], $features);
505                }
506
507                if (!$options['evaluation'] && $timesplitting->cache_indicator_calculations()) {
508                    $timecreated = time();
509                    foreach ($newindicatorcalculations as $sampleid => $calculatedvalue) {
510                        // Prepare the new calculations to be stored into DB.
511
512                        $indcalc = new \stdClass();
513                        $indcalc->contextid = $timesplitting->get_analysable()->get_context()->id;
514                        $indcalc->starttime = $range['start'];
515                        $indcalc->endtime = $range['end'];
516                        $indcalc->sampleid = $sampleid;
517                        $indcalc->sampleorigin = $this->analyser->get_samples_origin();
518                        $indcalc->indicator = $indicator->get_id();
519                        $indcalc->value = $calculatedvalue;
520                        $indcalc->timecreated = $timecreated;
521                        $newcalculations[] = $indcalc;
522                    }
523                }
524            }
525
526            if (!$options['evaluation'] && $timesplitting->cache_indicator_calculations()) {
527                $batchsize = self::get_insert_batch_size();
528                if (count($newcalculations) > $batchsize) {
529                    // We don't want newcalculations array to grow too much as we already keep the
530                    // system memory busy storing $dataset contents.
531
532                    // Insert from the beginning.
533                    $remaining = array_splice($newcalculations, $batchsize);
534
535                    // Sorry mssql and oracle, this will be slow.
536                    $DB->insert_records('analytics_indicator_calc', $newcalculations);
537                    $newcalculations = $remaining;
538                }
539            }
540        }
541
542        if (!$options['evaluation'] && $timesplitting->cache_indicator_calculations() && $newcalculations) {
543            // Insert the remaining records.
544            $DB->insert_records('analytics_indicator_calc', $newcalculations);
545        }
546
547        // Delete rows where all calculations are null.
548        // We still store the indicator calculation and we still store the sample id as
549        // processed so we don't have to process this sample again, but we exclude it
550        // from the dataset because it is not useful.
551        $nulls = array_diff_key($dataset, $notnulls);
552        foreach ($nulls as $uniqueid => $ignoredvalues) {
553            unset($dataset[$uniqueid]);
554        }
555
556        return $dataset;
557    }
558
559    /**
560     * Adds time range indicators and the target to each sample.
561     *
562     * This will identify the sample as belonging to a specific range.
563     *
564     * @param \core_analytics\local\time_splitting\base $timesplitting
565     * @param array $dataset
566     * @param array|null $calculatedtarget
567     * @return null
568     */
569    protected function fill_dataset(\core_analytics\local\time_splitting\base $timesplitting,
570            array &$dataset, ?array $calculatedtarget = null) {
571
572        $nranges = count($timesplitting->get_distinct_ranges());
573
574        foreach ($dataset as $uniquesampleid => $unmodified) {
575
576            list($analysersampleid, $rangeindex) = $timesplitting->infer_sample_info($uniquesampleid);
577
578            // No need to add range features if this time splitting method only defines one time range.
579            if ($nranges > 1) {
580
581                // 1 column for each range.
582                $timeindicators = array_fill(0, $nranges, 0);
583
584                $timeindicators[$rangeindex] = 1;
585
586                $dataset[$uniquesampleid] = array_merge($timeindicators, $dataset[$uniquesampleid]);
587            }
588
589            if ($calculatedtarget) {
590                // Add this sampleid's calculated target and the end.
591                $dataset[$uniquesampleid][] = $calculatedtarget[$analysersampleid];
592
593            } else {
594                // Add this sampleid, it will be used to identify the prediction that comes back from
595                // the predictions processor.
596                array_unshift($dataset[$uniquesampleid], $uniquesampleid);
597            }
598        }
599    }
600
601    /**
602     * Updates the analysable analysis time.
603     *
604     * @param array $processedanalysables
605     * @param int $analysableid
606     * @return null
607     */
608    protected function update_analysable_analysed_time(array $processedanalysables, int $analysableid) {
609        global $DB;
610
611        $now = time();
612
613        if (!empty($processedanalysables[$analysableid])) {
614            $obj = $processedanalysables[$analysableid];
615
616            $obj->id = $obj->primarykey;
617            unset($obj->primarykey);
618
619            $obj->timeanalysed = $now;
620
621            $DB->update_record('analytics_used_analysables', $obj);
622
623        } else {
624
625            $obj = new \stdClass();
626            $obj->modelid = $this->analyser->get_modelid();
627            $obj->action = ($this->includetarget) ? 'training' : 'prediction';
628            $obj->analysableid = $analysableid;
629            $obj->firstanalysis = $now;
630            $obj->timeanalysed = $now;
631
632            $obj->primarykey = $DB->insert_record('analytics_used_analysables', $obj);
633
634            // Update the cache just in case it is used in the same request.
635            $key = $this->analyser->get_modelid() . '_' . $analysableid;
636            $cache = \cache::make('core', 'modelfirstanalyses');
637            $cache->set($key, $now);
638        }
639    }
640
641    /**
642     * Fills a cache containing the first time each analysable in the provided model was analysed.
643     *
644     * @param int $modelid
645     * @param int|null $analysableid
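     * @return array The first analysis time of each matching analysable, indexed by the composed "modelid_analysableid" key.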
647     */
648    public static function fill_firstanalyses_cache(int $modelid, ?int $analysableid = null) {
649        global $DB;
650
651        // Using composed keys instead of cache $identifiers because of MDL-65358.
652        $primarykey = $DB->sql_concat($modelid, "'_'", 'analysableid');
653        $sql = "SELECT $primarykey AS id, MIN(firstanalysis) AS firstanalysis
654                  FROM {analytics_used_analysables} aua
655                 WHERE modelid = :modelid";
656        $params = ['modelid' => $modelid];
657
658        if ($analysableid) {
659            $sql .= " AND analysableid = :analysableid";
660            $params['analysableid'] = $analysableid;
661        }
662
663        $sql .= " GROUP BY modelid, analysableid ORDER BY analysableid";
664
665        $firstanalyses = $DB->get_records_sql($sql, $params);
666        if ($firstanalyses) {
667            $cache = \cache::make('core', 'modelfirstanalyses');
668
669            $firstanalyses = array_map(function($record) {
670                return $record->firstanalysis;
671            }, $firstanalyses);
672
673            $cache->set_many($firstanalyses);
674        }
675
676        return $firstanalyses;
677    }
678
679    /**
680     * Adds dataset context info.
681     *
682     * The final dataset document will look like this:
683     * ----------------------------------------------------
684     * metadata1,metadata2,metadata3,.....
685     * value1, value2, value3,.....
686     *
687     * header1,header2,header3,header4,.....
688     * stud1value1,stud1value2,stud1value3,stud1value4,.....
689     * stud2value1,stud2value2,stud2value3,stud2value4,.....
690     * .....
691     * ----------------------------------------------------
692     *
693     * @param \core_analytics\local\time_splitting\base $timesplitting
694     * @param array $dataset
695     * @param \core_analytics\local\target\base $target
696     * @return null
697     */
698    protected function add_context_metadata(\core_analytics\local\time_splitting\base $timesplitting, array &$dataset,
699            \core_analytics\local\target\base $target) {
700        $headers = $this->get_headers($timesplitting, $target);
701
702        // This will also reset samples' dataset keys.
703        array_unshift($dataset, $headers);
704    }
705
706    /**
707     * Returns the headers for the csv file based on the indicators and the target.
708     *
709     * @param \core_analytics\local\time_splitting\base $timesplitting
710     * @param \core_analytics\local\target\base $target
711     * @return string[]
712     */
713    public function get_headers(\core_analytics\local\time_splitting\base $timesplitting,
714            \core_analytics\local\target\base $target): array {
715        // 3rd column will contain the indicator ids.
716        $headers = array();
717
718        if (!$this->includetarget) {
719            // The first column is the sampleid.
720            $headers[] = 'sampleid';
721        }
722
723        // We always have 1 column for each time splitting method range, it does not depend on how
724        // many ranges we calculated.
725        $ranges = $timesplitting->get_distinct_ranges();
726        if (count($ranges) > 1) {
727            foreach ($ranges as $rangeindex) {
728                $headers[] = 'range/' . $rangeindex;
729            }
730        }
731
732        // Model indicators.
733        foreach ($this->analyser->get_indicators() as $indicator) {
734            $headers = array_merge($headers, $indicator::get_feature_headers());
735        }
736
737        // The target as well.
738        if ($this->includetarget) {
739            $headers[] = $target->get_id();
740        }
741
742        return $headers;
743    }
744
745    /**
746     * Filters out samples that have already been used for training.
747     *
748     * @param int[] $sampleids
749     * @param \core_analytics\local\time_splitting\base $timesplitting
750     * @return  null
751     */
752    protected function filter_out_train_samples(array &$sampleids, \core_analytics\local\time_splitting\base $timesplitting) {
753        global $DB;
754
755        $params = array('modelid' => $this->analyser->get_modelid(), 'analysableid' => $timesplitting->get_analysable()->get_id(),
756            'timesplitting' => $timesplitting->get_id());
757
758        $trainingsamples = $DB->get_records('analytics_train_samples', $params);
759
760        // Skip each file trained samples.
761        foreach ($trainingsamples as $trainingfile) {
762
763            $usedsamples = json_decode($trainingfile->sampleids, true);
764
765            if (!empty($usedsamples)) {
766                // Reset $sampleids to $sampleids minus this file's $usedsamples.
767                $sampleids = array_diff_key($sampleids, $usedsamples);
768            }
769        }
770    }
771
772    /**
773     * Filters out samples that have already been used for prediction.
774     *
775     * @param int[] $sampleids
776     * @param array $ranges
777     * @param \core_analytics\local\time_splitting\base $timesplitting
778     * @return  \stdClass|null The analytics_predict_samples record or null
779     */
780    protected function filter_out_prediction_samples_and_ranges(array &$sampleids, array &$ranges,
781            \core_analytics\local\time_splitting\base $timesplitting) {
782
783        if (count($ranges) > 1) {
784            throw new \coding_exception('$ranges argument should only contain one range');
785        }
786
787        $rangeindex = key($ranges);
788        $predictedrange = $this->get_predict_samples_record($timesplitting, $rangeindex);
789
790        if (!$predictedrange) {
791            // Nothing to filter out.
792            return null;
793        }
794
795        $predictedrange->sampleids = json_decode($predictedrange->sampleids, true);
796        $missingsamples = array_diff_key($sampleids, $predictedrange->sampleids);
797        if (count($missingsamples) === 0) {
798            // All samples already calculated.
799            unset($ranges[$rangeindex]);
800            return null;
801        }
802
803        // Replace the list of samples by the one excluding samples that already got predictions at this range.
804        $sampleids = $missingsamples;
805
806        return $predictedrange;
807    }
808
809    /**
810     * Returns a predict samples record.
811     *
812     * @param  \core_analytics\local\time_splitting\base $timesplitting
813     * @param  int                                       $rangeindex
814     * @return \stdClass|false
815     */
816    private function get_predict_samples_record(\core_analytics\local\time_splitting\base $timesplitting, int $rangeindex) {
817        global $DB;
818
819        $params = array('modelid' => $this->analyser->get_modelid(), 'analysableid' => $timesplitting->get_analysable()->get_id(),
820            'timesplitting' => $timesplitting->get_id(), 'rangeindex' => $rangeindex);
821        $predictedrange = $DB->get_record('analytics_predict_samples', $params);
822
823        return $predictedrange;
824    }
825
826    /**
827     * Saves samples that have just been used for training.
828     *
829     * @param int[] $sampleids
830     * @param \core_analytics\local\time_splitting\base $timesplitting
831     * @return null
832     */
833    protected function save_train_samples(array $sampleids, \core_analytics\local\time_splitting\base $timesplitting) {
834        global $DB;
835
836        $trainingsamples = new \stdClass();
837        $trainingsamples->modelid = $this->analyser->get_modelid();
838        $trainingsamples->analysableid = $timesplitting->get_analysable()->get_id();
839        $trainingsamples->timesplitting = $timesplitting->get_id();
840
841        $trainingsamples->sampleids = json_encode($sampleids);
842        $trainingsamples->timecreated = time();
843
844        $DB->insert_record('analytics_train_samples', $trainingsamples);
845    }
846
847    /**
848     * Saves samples that have just been used for prediction.
849     *
850     * @param int[] $sampleids
851     * @param array $ranges
852     * @param \core_analytics\local\time_splitting\base $timesplitting
853     * @param \stdClass|null $predictsamplesrecord The existing record or null if there is no record yet.
854     * @return null
855     */
856    protected function save_prediction_samples(array $sampleids, array $ranges,
857            \core_analytics\local\time_splitting\base $timesplitting, ?\stdClass $predictsamplesrecord = null) {
858        global $DB;
859
860        if (count($ranges) > 1) {
861            throw new \coding_exception('$ranges argument should only contain one range');
862        }
863
864        $rangeindex = key($ranges);
865
866        if ($predictsamplesrecord) {
867            // Append the new samples used for prediction.
868            $predictsamplesrecord->sampleids = json_encode($predictsamplesrecord->sampleids + $sampleids);
869            $predictsamplesrecord->timemodified = time();
870            $DB->update_record('analytics_predict_samples', $predictsamplesrecord);
871        } else {
872            $predictsamplesrecord = (object)[
873                'modelid' => $this->analyser->get_modelid(),
874                'analysableid' => $timesplitting->get_analysable()->get_id(),
875                'timesplitting' => $timesplitting->get_id(), 'rangeindex' => $rangeindex
876            ];
877            $predictsamplesrecord->sampleids = json_encode($sampleids);
878            $predictsamplesrecord->timecreated = time();
879            $predictsamplesrecord->timemodified = $predictsamplesrecord->timecreated;
880            $DB->insert_record('analytics_predict_samples', $predictsamplesrecord);
881        }
882    }
883
884    /**
885     * Flags the analysable element as in-analysis and stores a lock for it.
886     *
887     * @param  string $timesplittingid
888     * @param  int    $analysableid
889     * @return bool Success or not
890     */
891    private function init_analysable_analysis(string $timesplittingid, int $analysableid) {
892
893        // Do not include $this->includetarget as we don't want the same analysable to be analysed for training
894        // and prediction at the same time.
895        $lockkey = 'modelid:' . $this->analyser->get_modelid() . '-analysableid:' . $analysableid .
896            '-timesplitting:' . self::clean_time_splitting_id($timesplittingid);
897
898        // Large timeout as processes may be quite long.
899        $lockfactory = \core\lock\lock_config::get_lock_factory('core_analytics');
900
901        // If it is not ready in 10 secs skip this model + analysable + timesplittingmethod combination
902        // it will attempt it again during next cron run.
903        if (!$this->lock = $lockfactory->get_lock($lockkey, 10)) {
904            return false;
905        }
906        return true;
907    }
908
909
910    /**
911     * Remove all possibly problematic chars from the time splitting method id (id = its full class name).
912     *
913     * @param string $timesplittingid
914     * @return string
915     */
916    public static function clean_time_splitting_id($timesplittingid) {
917        $timesplittingid = str_replace('\\', '-', $timesplittingid);
918        return clean_param($timesplittingid, PARAM_ALPHANUMEXT);
919    }
920
921    /**
922     * Mark the currently analysed analysable+timesplitting as analysed.
923     *
924     * @return null
925     */
926    private function finish_analysable_analysis() {
927        $this->lock->release();
928    }
929
930    /**
931     * Returns the batch size used for insert_records.
932     *
933     * This method tries to find the best batch size without getting
934     * into dml internals. Maximum 1000 records to save memory.
935     *
936     * @return int
937     */
938    private static function get_insert_batch_size(): int {
939        global $DB;
940
941        $dbconfig = $DB->export_dbconfig();
942
943        // 500 is pgsql default so using 1000 is fine, no other db driver uses a hardcoded value.
944        if (empty($dbconfig) || empty($dbconfig->dboptions) || empty($dbconfig->dboptions['bulkinsertsize'])) {
945            return 1000;
946        }
947
948        $bulkinsert = $dbconfig->dboptions['bulkinsertsize'];
949        if ($bulkinsert < 1000) {
950            return $bulkinsert;
951        }
952
953        while ($bulkinsert > 1000) {
954            $bulkinsert = round($bulkinsert / 2, 0);
955        }
956
957        return (int)$bulkinsert;
958    }
959}
960