1<?php
2// This file is part of Moodle - http://moodle.org/
3//
4// Moodle is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// Moodle is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with Moodle.  If not, see <http://www.gnu.org/licenses/>.
16
17/**
18 * Document representation.
19 *
20 * @package    core_search
21 * @copyright  2015 David Monllao {@link http://www.davidmonllao.com}
22 * @license    http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
23 */
24
25namespace core_search;
26
27defined('MOODLE_INTERNAL') || die();
28
/**
 * Represents a document to index.
 *
 * Note that, if you are writing a search engine and you want to change \core_search\document
 * behaviour, you can override this class; it will be automatically loaded from \search_YOURENGINE\document.
 *
 * @package    core_search
 * @copyright  2015 David Monllao {@link http://www.davidmonllao.com}
 * @license    http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
 */
class document implements \renderable, \templatable {

    /**
     * @var array $data The document data, indexed by field name.
     */
    protected $data = [];

    /**
     * @var array Extra data needed to render the document (not part of the indexed fields).
     */
    protected $extradata = [];

    /**
     * @var \moodle_url|null Link to the document, null until set_doc_url() is called.
     */
    protected $docurl = null;

    /**
     * @var \moodle_url|null Link to the document context, null until set_context_url() is called.
     */
    protected $contexturl = null;

    /**
     * @var \core_search\document_icon|null Document icon instance, null until set_doc_icon() is called.
     */
    protected $docicon = null;

    /**
     * @var int|null The content field filearea.
     */
    protected $contentfilearea = null;

    /**
     * @var int|null The content field itemid.
     */
    protected $contentitemid = null;

    /**
     * @var bool Should be set to true if document hasn't been indexed before. False if unknown.
     */
    protected $isnew = false;

    /**
     * @var \stored_file[]|int[] Files attached to the document, indexed by file id. Entries are either
     *      stored_file instances or plain file ids that get_files() resolves on demand.
     */
    protected $files = [];

    /**
     * Change list (for engine implementers):
     * 2017091700 - add optional field groupid
     *
     * @var int Schema version number (update if any change)
     */
    const SCHEMA_VERSION = 2017091700;

    /**
     * All required fields any doc should contain.
     *
     * We have to choose a format to specify field types, using solr format as we have to choose one and solr is the
     * default search engine.
     *
     * Search engine plugins are responsible for setting their appropriate field types and mapping this naming to
     * whatever format they need.
     *
     * @var array
     */
    protected static $requiredfields = [
        'id' => [
            'type' => 'string',
            'stored' => true,
            'indexed' => false
        ],
        'itemid' => [
            'type' => 'int',
            'stored' => true,
            'indexed' => true
        ],
        'title' => [
            'type' => 'text',
            'stored' => true,
            'indexed' => true,
            'mainquery' => true
        ],
        'content' => [
            'type' => 'text',
            'stored' => true,
            'indexed' => true,
            'mainquery' => true
        ],
        'contextid' => [
            'type' => 'int',
            'stored' => true,
            'indexed' => true
        ],
        'areaid' => [
            'type' => 'string',
            'stored' => true,
            'indexed' => true
        ],
        'type' => [
            'type' => 'int',
            'stored' => true,
            'indexed' => true
        ],
        'courseid' => [
            'type' => 'int',
            'stored' => true,
            'indexed' => true
        ],
        'owneruserid' => [
            'type' => 'int',
            'stored' => true,
            'indexed' => true
        ],
        'modified' => [
            'type' => 'tdate',
            'stored' => true,
            'indexed' => true
        ],
    ];

    /**
     * All optional fields docs can contain.
     *
     * Although it matches solr fields format, this is just to define the field types. Search
     * engine plugins are responsible for setting their appropriate field types and mapping this
     * naming to whatever format they need.
     *
     * @var array
     */
    protected static $optionalfields = [
        'userid' => [
            'type' => 'int',
            'stored' => true,
            'indexed' => true
        ],
        'groupid' => [
            'type' => 'int',
            'stored' => true,
            'indexed' => true
        ],
        'description1' => [
            'type' => 'text',
            'stored' => true,
            'indexed' => true,
            'mainquery' => true
        ],
        'description2' => [
            'type' => 'text',
            'stored' => true,
            'indexed' => true,
            'mainquery' => true
        ]
    ];

    /**
     * Any fields that are engine specific. These are fields that are solely used by a search engine plugin
     * for internal purposes.
     *
     * Field names should be prefixed with engine name to avoid potential conflict with core fields.
     *
     * Uses same format as fields above.
     *
     * @var array
     */
    protected static $enginefields = [];

    /**
     * We ensure that the document has a unique id across search areas.
     *
     * The document id is built as "<areaid>-<itemid>" using $itemid exactly as it was passed in,
     * while the stored 'itemid' field is the integer cast of it.
     *
     * @throws \coding_exception If $itemid is not numeric.
     * @param int $itemid An id unique to the search area
     * @param string $componentname The search area component Frankenstyle name
     * @param string $areaname The area name (the search area class name)
     */
    public function __construct($itemid, $componentname, $areaname) {

        if (!is_numeric($itemid)) {
            throw new \coding_exception('The itemid should be an integer');
        }

        $areaid = \core_search\manager::generate_areaid($componentname, $areaname);

        $this->data['areaid'] = $areaid;
        $this->data['id'] = $areaid . '-' . $itemid;
        $this->data['itemid'] = intval($itemid);
    }

    /**
     * Add a stored file to the document.
     *
     * A numeric argument is kept as a bare file id and only resolved to a stored_file
     * when get_files() is called.
     *
     * @param \stored_file|int $file The file to add, or file id.
     * @return void
     */
    public function add_stored_file($file) {
        $fileid = is_numeric($file) ? $file : $file->get_id();
        $this->files[$fileid] = $file;
    }

    /**
     * Returns the array of attached files.
     *
     * Entries that were added as bare file ids are replaced by their stored_file instance;
     * ids whose file can not be found are dropped from the list.
     *
     * @return \stored_file[] Indexed by file id.
     */
    public function get_files() {
        // Fetched at most once, and only if there is at least one id to resolve.
        $fs = null;

        // The files array can contain stored file ids, so we need to get instances if asked.
        foreach ($this->files as $id => $listfile) {
            if (!is_numeric($listfile)) {
                // Already a stored_file instance.
                continue;
            }

            if ($fs === null) {
                $fs = get_file_storage();
            }

            $file = $fs->get_file_by_id($id);
            if ($file) {
                $this->files[$id] = $file;
            } else {
                // Index is out of date and referencing a file that does not exist.
                unset($this->files[$id]);
            }
        }

        return $this->files;
    }

    /**
     * Setter.
     *
     * Basic checks to prevent common issues.
     *
     * int and tdate fields must be numeric and are cast to a PHP integer; tdate values are
     * expected to be timestamps. Any other field gets disallowed Unicode non-characters removed
     * and every run of whitespace (including line breaks) collapsed into a single space.
     * Note that tags are NOT stripped here.
     *
     * @throws \coding_exception If the field is unknown or an int/tdate value is not numeric.
     * @throws \moodle_exception If the whitespace normalisation fails (preg_replace error).
     * @param string $fieldname The field name
     * @param string|int $value The value to store
     * @return string|int The stored value
     */
    public function set($fieldname, $value) {

        // Look the field up in the required, optional and engine definitions, in that order.
        if (!empty(static::$requiredfields[$fieldname])) {
            $fielddata = static::$requiredfields[$fieldname];
        } else if (!empty(static::$optionalfields[$fieldname])) {
            $fielddata = static::$optionalfields[$fieldname];
        } else if (!empty(static::$enginefields[$fieldname])) {
            $fielddata = static::$enginefields[$fieldname];
        }

        if (empty($fielddata)) {
            throw new \coding_exception('"' . $fieldname . '" field does not exist.');
        }

        // The tdate fields should be set as timestamps, later they might be converted to
        // a date format, it depends on the search engine.
        $isinteger = ($fielddata['type'] === 'int' || $fielddata['type'] === 'tdate');

        if ($isinteger) {
            if (!is_numeric($value)) {
                // Arrays and objects without __toString can not be concatenated into the message
                // without raising a notice/error that would hide this exception, so fall back to
                // their type name. Everything else is rendered exactly as plain concatenation would.
                $isprintable = is_scalar($value) || $value === null ||
                        (is_object($value) && method_exists($value, '__toString'));
                $shown = $isprintable ? (string) $value : gettype($value);
                throw new \coding_exception('"' . $fieldname . '" value should be an integer and its value is "' .
                        $shown . '"');
            }

            // We want to be strict here, there might be engines that expect us to
            // provide them data with the proper type already set.
            $this->data[$fieldname] = intval($value);
            return $this->data[$fieldname];
        }

        // Remove disallowed Unicode characters.
        $value = \core_text::remove_unicode_non_characters($value);

        // Replace all groups of line breaks and spaces by single spaces.
        $this->data[$fieldname] = preg_replace('/\s+/u', ' ', $value);
        if ($this->data[$fieldname] === null) {
            // A null result means preg_replace itself failed.
            $docid = isset($this->data['id']) ? $this->data['id'] : '(unknown)';
            throw new \moodle_exception('error_indexing', 'search', '', null, '"' . $fieldname .
                    '" value causes preg_replace error (may be caused by unusual characters) ' .
                    'in document with id "' . $docid . '"');
        }

        return $this->data[$fieldname];
    }

    /**
     * Sets data to this->extradata
     *
     * This data can be retrieved using \core_search\document->get($fieldname), which falls back
     * to the extra data when no indexed field with that name is set.
     *
     * @param string $fieldname
     * @param string $value
     * @return void
     */
    public function set_extra($fieldname, $value) {
        $this->extradata[$fieldname] = $value;
    }

    /**
     * Getter.
     *
     * Looks at the document fields first and at the extra data second.
     *
     * Use self::is_set if you are not sure if this field is set or not
     * as otherwise it will trigger a \coding_exception
     *
     * @throws \coding_exception If the field is set in neither place.
     * @param string $field
     * @return string|int
     */
    public function get($field) {

        if (isset($this->data[$field])) {
            return $this->data[$field];
        }

        // Fallback to extra data.
        if (isset($this->extradata[$field])) {
            return $this->extradata[$field];
        }

        throw new \coding_exception('Field "' . $field . '" is not set in the document');
    }

    /**
     * Checks if a field is set, either as a document field or as extra data.
     *
     * @param string $field
     * @return bool
     */
    public function is_set($field) {
        return isset($this->data[$field]) || isset($this->extradata[$field]);
    }

    /**
     * Set if this is a new document. False if unknown.
     *
     * @param bool $new Cast to bool before being stored.
     */
    public function set_is_new($new) {
        $this->isnew = (bool)$new;
    }

    /**
     * Returns if the document is new. False if unknown.
     *
     * @return bool
     */
    public function get_is_new() {
        return $this->isnew;
    }

    /**
     * Returns all default fields definitions.
     *
     * Required, optional and engine fields are combined with the array union operator, so on a
     * name clash the required definition wins over the optional one, and both win over the engine one.
     *
     * @return array
     */
    public static function get_default_fields_definition() {
        return static::$requiredfields + static::$optionalfields + static::$enginefields;
    }

    /**
     * Formats the timestamp preparing the time fields to be inserted into the search engine.
     *
     * By default it just returns a timestamp so any search engine could just store integers
     * and use integers comparison to get documents between x and y timestamps, but search
     * engines might be interested in using their own field formats. They can do it extending
     * this class in \search_xxx\document.
     *
     * @param int $timestamp
     * @return int|string This default implementation returns $timestamp unchanged.
     */
    public static function format_time_for_engine($timestamp) {
        return $timestamp;
    }

    /**
     * Formats a string value for the search engine.
     *
     * Search engines may override this method to apply restrictions, like limiting the size.
     * The default behaviour is just returning the string.
     *
     * @param string $string
     * @return string
     */
    public static function format_string_for_engine($string) {
        return $string;
    }

    /**
     * Formats a text value for the search engine.
     *
     * Search engines may override this method to apply restrictions, like limiting the size.
     * The default behaviour is just returning the string.
     *
     * @param string $text
     * @return string
     */
    public static function format_text_for_engine($text) {
        return $text;
    }

    /**
     * Returns a timestamp from the value stored in the search engine.
     *
     * By default it just returns a timestamp so any search engine could just store integers
     * and use integers comparison to get documents between x and y timestamps, but search
     * engines might be interested in using their own field formats. They should do it extending
     * this class in \search_xxx\document.
     *
     * @param string $time
     * @return int This default implementation returns $time unchanged.
     */
    public static function import_time_from_engine($time) {
        return $time;
    }

    /**
     * Returns how text is returned from the search engine.
     *
     * @return int One of the FORMAT_* constants; FORMAT_PLAIN by default.
     */
    protected function get_text_format() {
        return FORMAT_PLAIN;
    }

    /**
     * Fills the document with data coming from the search engine.
     *
     * Every known field (required, optional, engine) present in $docdata is stored through set();
     * tdate values go through import_time_from_engine() first.
     *
     * @throws \core_search\engine_exception If a non-tdate field holds an array (multivalued field).
     * @param array $docdata
     * @return void
     */
    public function set_data_from_engine($docdata) {
        $fields = static::$requiredfields + static::$optionalfields + static::$enginefields;
        foreach ($fields as $fieldname => $field) {

            // Optional params might not be there.
            if (!isset($docdata[$fieldname])) {
                continue;
            }

            if ($field['type'] === 'tdate') {
                // Time fields may need a preprocessing.
                $this->set($fieldname, static::import_time_from_engine($docdata[$fieldname]));
                continue;
            }

            // No way we can make this work if there is any multivalue field.
            if (is_array($docdata[$fieldname])) {
                throw new \core_search\engine_exception('multivaluedfield', 'search_solr', '', $fieldname);
            }
            $this->set($fieldname, $docdata[$fieldname]);
        }
    }

    /**
     * Sets the document url.
     *
     * @param \moodle_url $url
     * @return void
     */
    public function set_doc_url(\moodle_url $url) {
        $this->docurl = $url;
    }

    /**
     * Gets the url to the doc.
     *
     * @return \moodle_url|null Null if no url has been set.
     */
    public function get_doc_url() {
        return $this->docurl;
    }

    /**
     * Sets document icon instance.
     *
     * @param \core_search\document_icon $docicon
     */
    public function set_doc_icon(document_icon $docicon) {
        $this->docicon = $docicon;
    }

    /**
     * Gets document icon instance.
     *
     * @return \core_search\document_icon|null Null if no icon has been set.
     */
    public function get_doc_icon() {
        return $this->docicon;
    }

    /**
     * Sets the url to the document context.
     *
     * @param \moodle_url $url
     * @return void
     */
    public function set_context_url(\moodle_url $url) {
        $this->contexturl = $url;
    }

    /**
     * Gets the url to the context.
     *
     * @return \moodle_url|null Null if no url has been set.
     */
    public function get_context_url() {
        return $this->contexturl;
    }

    /**
     * Returns the document ready to submit to the search engine.
     *
     * Defaults are applied first (this does modify the instance), then a copy of the data is
     * run through the engine formatters. Every required field must be set; optional and engine
     * fields are only formatted when present.
     *
     * @throws \coding_exception If a required field is missing.
     * @return array
     */
    public function export_for_engine() {
        // Set any unset defaults.
        $this->apply_defaults();

        // We don't want to affect the document instance.
        $data = $this->data;

        // Apply specific engine-dependent formats and restrictions.
        foreach (static::$requiredfields as $fieldname => $field) {

            // We also check that we have everything we need.
            if (!isset($data[$fieldname])) {
                throw new \coding_exception('Missing "' . $fieldname . '" field in document with id "' . $this->data['id'] . '"');
            }

            $data[$fieldname] = self::format_value_for_engine($field['type'], $data[$fieldname]);
        }

        $fields = static::$optionalfields + static::$enginefields;
        foreach ($fields as $fieldname => $field) {
            if (isset($data[$fieldname])) {
                $data[$fieldname] = self::format_value_for_engine($field['type'], $data[$fieldname]);
            }
        }

        return $data;
    }

    /**
     * Runs a single value through the engine formatter matching its field type.
     *
     * The formatters are resolved with late static binding, so engine subclasses overriding
     * format_time_for_engine(), format_string_for_engine() or format_text_for_engine() are honoured.
     *
     * @param string $type The field type as declared in the field definitions.
     * @param mixed $value The value currently stored for the field.
     * @return mixed The formatted value; returned unchanged for any other type (e.g. 'int').
     */
    private static function format_value_for_engine($type, $value) {
        if ($type === 'tdate') {
            // Overwrite the timestamp with the engine dependent format.
            return static::format_time_for_engine($value);
        }
        if ($type === 'string') {
            // Overwrite the string with the engine dependent format.
            return static::format_string_for_engine($value);
        }
        if ($type === 'text') {
            // Overwrite the text with the engine dependent format.
            return static::format_text_for_engine($value);
        }
        return $value;
    }

    /**
     * Apply any defaults to unset fields before export. Called after document building, but before export.
     *
     * Sub-classes of this should make sure to call parent::apply_defaults().
     */
    protected function apply_defaults() {
        // Set the default type, TYPE_TEXT.
        if (!isset($this->data['type'])) {
            $this->data['type'] = manager::TYPE_TEXT;
        }
    }

    /**
     * Export the document data to be used as a template context.
     *
     * Adding more info than the required one as people might be interested in extending the template.
     *
     * Although content is a required field when setting up the document, it accepts '' (empty) values
     * as they may be the result of stripping out HTML.
     *
     * SECURITY NOTE: It is the responsibility of the document to properly escape any text to be displayed.
     * The renderer will output the content without any further cleaning.
     *
     * @throws \coding_exception If a field read through get() ('areaid', 'courseid', 'coursefullname',
     *                           'contextid', 'modified', or 'userfullname' when 'userid' is set) is missing.
     * @param \renderer_base $output The renderer.
     * @return array
     */
    public function export_for_template(\renderer_base $output) {
        $areaid = $this->get('areaid');
        list($componentname, $areaname) = \core_search\manager::extract_areaid_parts($areaid);

        $searcharea = \core_search\manager::get_search_area($areaid);
        $title = $this->is_set('title') ? $this->format_text($searcharea->get_document_display_title($this)) : '';

        // Read in the same order the template data is built, so the first missing field reported stays the same.
        $courseurl = course_get_url($this->get('courseid'));
        $coursefullname = $this->get('coursefullname');

        // Shared by every format_string() call below.
        $formatoptions = ['context' => $this->get('contextid')];

        $data = [
            'componentname' => $componentname,
            'areaname' => $areaname,
            'courseurl' => $courseurl,
            'coursefullname' => format_string($coursefullname, true, $formatoptions),
            'modified' => userdate($this->get('modified')),
            'title' => ($title !== '') ? $title : get_string('notitle', 'search'),
            'docurl' => $this->get_doc_url(),
            'content' => $this->is_set('content') ? $this->format_text($this->get('content')) : null,
            'contexturl' => $this->get_context_url(),
            'description1' => $this->is_set('description1') ? $this->format_text($this->get('description1')) : null,
            'description2' => $this->is_set('description2') ? $this->format_text($this->get('description2')) : null,
        ];

        // Now take any attached files.
        $files = $this->get_files();
        if (!empty($files)) {
            if (count($files) > 1) {
                $filenames = [];
                foreach ($files as $file) {
                    $filenames[] = format_string($file->get_filename(), true, $formatoptions);
                }
                $data['multiplefiles'] = true;
                $data['filenames'] = $filenames;
            } else {
                $file = reset($files);
                $data['filename'] = format_string($file->get_filename(), true, $formatoptions);
            }
        }

        if ($this->is_set('userid')) {
            $data['userurl'] = new \moodle_url('/user/view.php', ['id' => $this->get('userid'), 'course' => $this->get('courseid')]);
            $data['userfullname'] = format_string($this->get('userfullname'), true, $formatoptions);
        }

        if ($docicon = $this->get_doc_icon()) {
            $data['icon'] = $output->image_url($docicon->get_name(), $docicon->get_component());
        }

        return $data;
    }

    /**
     * Formats a text string coming from the search engine.
     *
     * The text is passed to the global format_text() using the format returned by
     * get_text_format() and the document's 'contextid' as context:
     * - Search areas are responsible for sending just plain data, the search engine may
     *   append HTML or markdown to it (highlighting for example).
     * - The view is responsible for shortening the text if it is too big
     *
     * @throws \coding_exception If 'contextid' is not set in the document.
     * @param  string $text Text to format
     * @return string HTML text to be rendered
     */
    protected function format_text($text) {
        return format_text($text, $this->get_text_format(), ['context' => $this->get('contextid')]);
    }
}
677