<?php
// This file is part of Moodle - http://moodle.org/
//
// Moodle is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Moodle is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Moodle.  If not, see <http://www.gnu.org/licenses/>.

/**
 * Document representation.
 *
 * @package    core_search
 * @copyright  2015 David Monllao {@link http://www.davidmonllao.com}
 * @license    http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
 */

namespace core_search;

defined('MOODLE_INTERNAL') || die();

/**
 * Represents a document to index.
 *
 * Note that, if you are writing a search engine and you want to change \core_search\document
 * behaviour, you can overwrite this class; it will be automatically loaded from \search_YOURENGINE\document.
 *
 * @package    core_search
 * @copyright  2015 David Monllao {@link http://www.davidmonllao.com}
 * @license    http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
 */
class document implements \renderable, \templatable {

    /**
     * @var array $data The document data.
     */
    protected $data = array();

    /**
     * @var array Extra data needed to render the document.
     */
    protected $extradata = array();

    /**
     * @var \moodle_url Link to the document.
     */
    protected $docurl = null;

    /**
     * @var \moodle_url Link to the document context.
     */
    protected $contexturl = null;

    /**
     * @var \core_search\document_icon Document icon instance.
     */
    protected $docicon = null;

    /**
     * @var int|null The content field filearea.
     */
    protected $contentfilearea = null;

    /**
     * @var int|null The content field itemid.
     */
    protected $contentitemid = null;

    /**
     * @var bool Should be set to true if document hasn't been indexed before. False if unknown.
     */
    protected $isnew = false;

    /**
     * @var \stored_file[] An array of stored files to attach to the document.
     */
    protected $files = array();

    /**
     * Change list (for engine implementers):
     * 2017091700 - add optional field groupid
     *
     * @var int Schema version number (update if any change)
     */
    const SCHEMA_VERSION = 2017091700;

    /**
     * All required fields any doc should contain.
     *
     * We have to choose a format to specify field types, using solr format as we have to choose one and solr is the
     * default search engine.
     *
     * Search engine plugins are responsible of setting their appropriate field types and map these naming to whatever format
     * they need.
     *
     * @var array
     */
    protected static $requiredfields = array(
        'id' => array(
            'type' => 'string',
            'stored' => true,
            'indexed' => false
        ),
        'itemid' => array(
            'type' => 'int',
            'stored' => true,
            'indexed' => true
        ),
        'title' => array(
            'type' => 'text',
            'stored' => true,
            'indexed' => true,
            'mainquery' => true
        ),
        'content' => array(
            'type' => 'text',
            'stored' => true,
            'indexed' => true,
            'mainquery' => true
        ),
        'contextid' => array(
            'type' => 'int',
            'stored' => true,
            'indexed' => true
        ),
        'areaid' => array(
            'type' => 'string',
            'stored' => true,
            'indexed' => true
        ),
        'type' => array(
            'type' => 'int',
            'stored' => true,
            'indexed' => true
        ),
        'courseid' => array(
            'type' => 'int',
            'stored' => true,
            'indexed' => true
        ),
        'owneruserid' => array(
            'type' => 'int',
            'stored' => true,
            'indexed' => true
        ),
        'modified' => array(
            'type' => 'tdate',
            'stored' => true,
            'indexed' => true
        ),
    );

    /**
     * All optional fields docs can contain.
     *
     * Although it matches solr fields format, this is just to define the field types. Search
     * engine plugins are responsible of setting their appropriate field types and map these
     * naming to whatever format they need.
     *
     * @var array
     */
    protected static $optionalfields = array(
        'userid' => array(
            'type' => 'int',
            'stored' => true,
            'indexed' => true
        ),
        'groupid' => array(
            'type' => 'int',
            'stored' => true,
            'indexed' => true
        ),
        'description1' => array(
            'type' => 'text',
            'stored' => true,
            'indexed' => true,
            'mainquery' => true
        ),
        'description2' => array(
            'type' => 'text',
            'stored' => true,
            'indexed' => true,
            'mainquery' => true
        )
    );

    /**
     * Any fields that are engine specific. These are fields that are solely used by a search engine plugin
     * for internal purposes.
     *
     * Field names should be prefixed with engine name to avoid potential conflict with core fields.
     *
     * Uses same format as fields above.
     *
     * @var array
     */
    protected static $enginefields = array();

    /**
     * We ensure that the document has a unique id across search areas.
     *
     * @throws \coding_exception If the item id is not numeric.
     * @param int $itemid An id unique to the search area
     * @param string $componentname The search area component Frankenstyle name
     * @param string $areaname The area name (the search area class name)
     * @return void
     */
    public function __construct($itemid, $componentname, $areaname) {

        if (!is_numeric($itemid)) {
            throw new \coding_exception('The itemid should be an integer');
        }

        $this->data['areaid'] = \core_search\manager::generate_areaid($componentname, $areaname);
        $this->data['id'] = $this->data['areaid'] . '-' . $itemid;
        $this->data['itemid'] = intval($itemid);
    }

    /**
     * Add a stored file to the document.
     *
     * A bare file id is stored as a placeholder and only resolved to a
     * \stored_file instance when {@see get_files()} is called.
     *
     * @param \stored_file|int $file The file to add, or file id.
     * @return void
     */
    public function add_stored_file($file) {
        if (is_numeric($file)) {
            $this->files[$file] = $file;
        } else {
            $this->files[$file->get_id()] = $file;
        }
    }

    /**
     * Returns the array of attached files.
     *
     * Entries that were added as file ids are replaced by their \stored_file instance;
     * ids that no longer resolve to a file are dropped from the list.
     *
     * @return \stored_file[]
     */
    public function get_files() {
        // The file storage is only needed when an entry is still a bare id, so fetch it lazily and once.
        $fs = null;

        // The files array can contain stored file ids, so we need to get instances if asked.
        foreach ($this->files as $id => $listfile) {
            if (!is_numeric($listfile)) {
                continue;
            }

            if ($fs === null) {
                $fs = get_file_storage();
            }

            if ($file = $fs->get_file_by_id($id)) {
                $this->files[$id] = $file;
            } else {
                unset($this->files[$id]); // Index is out of date and referencing a file that does not exist.
            }
        }

        return $this->files;
    }

    /**
     * Setter.
     *
     * Basic checkings to prevent common issues.
     *
     * Integer and date (tdate) fields must be numeric and are cast to a PHP integer; tdate
     * field values are expected to be timestamps. Any other field has disallowed Unicode
     * characters removed and every run of whitespace collapsed into a single space.
     *
     * @throws \coding_exception If the field is unknown or a numeric field gets a non-numeric value.
     * @throws \moodle_exception If the whitespace clean-up fails for the provided value.
     * @param string $fieldname The field name
     * @param string|int $value The value to store
     * @return string|int The stored value
     */
    public function set($fieldname, $value) {

        if (!empty(static::$requiredfields[$fieldname])) {
            $fielddata = static::$requiredfields[$fieldname];
        } else if (!empty(static::$optionalfields[$fieldname])) {
            $fielddata = static::$optionalfields[$fieldname];
        } else if (!empty(static::$enginefields[$fieldname])) {
            $fielddata = static::$enginefields[$fieldname];
        }

        if (empty($fielddata)) {
            throw new \coding_exception('"' . $fieldname . '" field does not exist.');
        }

        // We want to be strict here, there might be engines that expect us to
        // provide them data with the proper type already set.
        if ($fielddata['type'] === 'int' || $fielddata['type'] === 'tdate') {
            // tdate fields should be set as timestamps, later they might be converted to
            // a date format, it depends on the search engine.
            if (!is_numeric($value)) {
                // Arrays and objects cannot be safely concatenated into the message, so report their type instead.
                $printable = (is_scalar($value) || $value === null) ? $value : gettype($value);
                throw new \coding_exception('"' . $fieldname . '" value should be an integer and its value is "' .
                        $printable . '"');
            }

            $this->data[$fieldname] = intval($value);
            return $this->data[$fieldname];
        }

        // Remove disallowed Unicode characters.
        $value = \core_text::remove_unicode_non_characters($value);

        // Replace all groups of line breaks and spaces by single spaces.
        $this->data[$fieldname] = preg_replace("/\s+/u", " ", $value);

        // preg_replace() returns null on failure (for instance malformed UTF-8 with the /u modifier).
        if ($this->data[$fieldname] === null) {
            if (isset($this->data['id'])) {
                $docid = $this->data['id'];
            } else {
                $docid = '(unknown)';
            }
            throw new \moodle_exception('error_indexing', 'search', '', null, '"' . $fieldname .
                    '" value causes preg_replace error (may be caused by unusual characters) ' .
                    'in document with id "' . $docid . '"');
        }

        return $this->data[$fieldname];
    }

    /**
     * Sets data to this->extradata
     *
     * This data can be retrieved using \core_search\document->get($fieldname).
     *
     * @param string $fieldname
     * @param string $value
     * @return void
     */
    public function set_extra($fieldname, $value) {
        $this->extradata[$fieldname] = $value;
    }

    /**
     * Getter.
     *
     * Use self::is_set if you are not sure if this field is set or not
     * as otherwise it will trigger a \coding_exception
     *
     * @throws \coding_exception
     * @param string $field
     * @return string|int
     */
    public function get($field) {

        if (isset($this->data[$field])) {
            return $this->data[$field];
        }

        // Fallback to extra data.
        if (isset($this->extradata[$field])) {
            return $this->extradata[$field];
        }

        throw new \coding_exception('Field "' . $field . '" is not set in the document');
    }

    /**
     * Checks if a field is set, either as document data or as extra data.
     *
     * @param string $field
     * @return bool
     */
    public function is_set($field) {
        return (isset($this->data[$field]) || isset($this->extradata[$field]));
    }

    /**
     * Set if this is a new document. False if unknown.
     *
     * @param bool $new
     */
    public function set_is_new($new) {
        $this->isnew = (bool)$new;
    }

    /**
     * Returns if the document is new. False if unknown.
     *
     * @return bool
     */
    public function get_is_new() {
        return $this->isnew;
    }

    /**
     * Returns all default fields definitions.
     *
     * Required definitions take precedence over optional ones, which take precedence
     * over engine ones, if the same field name appears more than once.
     *
     * @return array
     */
    public static function get_default_fields_definition() {
        return static::$requiredfields + static::$optionalfields + static::$enginefields;
    }

    /**
     * Formats the timestamp preparing the time fields to be inserted into the search engine.
     *
     * By default it just returns a timestamp so any search engine could just store integers
     * and use integers comparison to get documents between x and y timestamps, but search
     * engines might be interested in using their own field formats. They can do it extending
     * this class in \search_xxx\document.
     *
     * @param int $timestamp
     * @return string
     */
    public static function format_time_for_engine($timestamp) {
        return $timestamp;
    }

    /**
     * Formats a string value for the search engine.
     *
     * Search engines may overwrite this method to apply restrictions, like limiting the size.
     * The default behaviour is just returning the string.
     *
     * @param string $string
     * @return string
     */
    public static function format_string_for_engine($string) {
        return $string;
    }

    /**
     * Formats a text value for the search engine.
     *
     * Search engines may overwrite this method to apply restrictions, like limiting the size.
     * The default behaviour is just returning the string.
     *
     * @param string $text
     * @return string
     */
    public static function format_text_for_engine($text) {
        return $text;
    }

    /**
     * Returns a timestamp from the value stored in the search engine.
     *
     * By default it just returns a timestamp so any search engine could just store integers
     * and use integers comparison to get documents between x and y timestamps, but search
     * engines might be interested in using their own field formats. They should do it extending
     * this class in \search_xxx\document.
     *
     * @param string $time
     * @return int
     */
    public static function import_time_from_engine($time) {
        return $time;
    }

    /**
     * Returns how text is returned from the search engine.
     *
     * @return int
     */
    protected function get_text_format() {
        return FORMAT_PLAIN;
    }

    /**
     * Fills the document with data coming from the search engine.
     *
     * @throws \core_search\engine_exception
     * @param array $docdata
     * @return void
     */
    public function set_data_from_engine($docdata) {
        $fields = static::$requiredfields + static::$optionalfields + static::$enginefields;
        foreach ($fields as $fieldname => $field) {

            // Optional params might not be there.
            if (!isset($docdata[$fieldname])) {
                continue;
            }

            if ($field['type'] === 'tdate') {
                // Time fields may need a preprocessing.
                $this->set($fieldname, static::import_time_from_engine($docdata[$fieldname]));
                continue;
            }

            // No way we can make this work if there is any multivalue field.
            if (is_array($docdata[$fieldname])) {
                throw new \core_search\engine_exception('multivaluedfield', 'search_solr', '', $fieldname);
            }
            $this->set($fieldname, $docdata[$fieldname]);
        }
    }

    /**
     * Sets the document url.
     *
     * @param \moodle_url $url
     * @return void
     */
    public function set_doc_url(\moodle_url $url) {
        $this->docurl = $url;
    }

    /**
     * Gets the url to the doc.
     *
     * @return \moodle_url
     */
    public function get_doc_url() {
        return $this->docurl;
    }

    /**
     * Sets document icon instance.
     *
     * @param \core_search\document_icon $docicon
     */
    public function set_doc_icon(document_icon $docicon) {
        $this->docicon = $docicon;
    }

    /**
     * Gets document icon instance.
     *
     * @return \core_search\document_icon
     */
    public function get_doc_icon() {
        return $this->docicon;
    }

    /**
     * Sets the url to the document context.
     *
     * @param \moodle_url $url
     * @return void
     */
    public function set_context_url(\moodle_url $url) {
        $this->contexturl = $url;
    }

    /**
     * Gets the url to the context.
     *
     * @return \moodle_url
     */
    public function get_context_url() {
        return $this->contexturl;
    }

    /**
     * Returns the document ready to submit to the search engine.
     *
     * Every required field must be set; optional and engine fields are only exported
     * when they have a value. Each exported value goes through the engine formatter
     * that matches its field type.
     *
     * @throws \coding_exception If a required field is missing.
     * @return array
     */
    public function export_for_engine() {
        // Set any unset defaults.
        $this->apply_defaults();

        // We don't want to affect the document instance.
        $data = $this->data;

        // Required fields come first in the union, so they are all validated/formatted before any optional one.
        $fields = static::$requiredfields + static::$optionalfields + static::$enginefields;
        foreach ($fields as $fieldname => $field) {

            if (!isset($data[$fieldname])) {
                // We also check that we have everything we need.
                if (array_key_exists($fieldname, static::$requiredfields)) {
                    throw new \coding_exception('Missing "' . $fieldname . '" field in document with id "' .
                            $this->data['id'] . '"');
                }
                // Unset optional and engine fields are simply not exported.
                continue;
            }

            // Apply specific engine-dependant formats and restrictions.
            if ($field['type'] === 'tdate') {
                // Overwrite the timestamp with the engine dependant format.
                $data[$fieldname] = static::format_time_for_engine($data[$fieldname]);
            } else if ($field['type'] === 'string') {
                // Overwrite the string with the engine dependant format.
                $data[$fieldname] = static::format_string_for_engine($data[$fieldname]);
            } else if ($field['type'] === 'text') {
                // Overwrite the text with the engine dependant format.
                $data[$fieldname] = static::format_text_for_engine($data[$fieldname]);
            }
        }

        return $data;
    }

    /**
     * Apply any defaults to unset fields before export. Called after document building, but before export.
     *
     * Sub-classes of this should make sure to call parent::apply_defaults().
     */
    protected function apply_defaults() {
        // Set the default type, TYPE_TEXT.
        if (!isset($this->data['type'])) {
            $this->data['type'] = manager::TYPE_TEXT;
        }
    }

    /**
     * Export the document data to be used as a template context.
     *
     * Adding more info than the required one as people might be interested in extending the template.
     *
     * Although content is a required field when setting up the document, it accepts '' (empty) values
     * as they may be the result of stripping out HTML.
     *
     * SECURITY NOTE: It is the responsibility of the document to properly escape any text to be displayed.
     * The renderer will output the content without any further cleaning.
     *
     * @param \renderer_base $output The renderer.
     * @return array
     */
    public function export_for_template(\renderer_base $output) {
        list($componentname, $areaname) = \core_search\manager::extract_areaid_parts($this->get('areaid'));

        $searcharea = \core_search\manager::get_search_area($this->data['areaid']);
        $title = $this->is_set('title') ? $this->format_text($searcharea->get_document_display_title($this)) : '';
        $data = [
            'componentname' => $componentname,
            'areaname' => $areaname,
            'courseurl' => course_get_url($this->get('courseid')),
            'coursefullname' => format_string($this->get('coursefullname'), true, array('context' => $this->get('contextid'))),
            'modified' => userdate($this->get('modified')),
            'title' => ($title !== '') ? $title : get_string('notitle', 'search'),
            'docurl' => $this->get_doc_url(),
            'content' => $this->is_set('content') ? $this->format_text($this->get('content')) : null,
            'contexturl' => $this->get_context_url(),
            'description1' => $this->is_set('description1') ? $this->format_text($this->get('description1')) : null,
            'description2' => $this->is_set('description2') ? $this->format_text($this->get('description2')) : null,
        ];

        // Now take any attached files.
        $files = $this->get_files();
        if (!empty($files)) {
            if (count($files) > 1) {
                $filenames = array();
                foreach ($files as $file) {
                    $filenames[] = format_string($file->get_filename(), true, array('context' => $this->get('contextid')));
                }
                $data['multiplefiles'] = true;
                $data['filenames'] = $filenames;
            } else {
                $file = reset($files);
                $data['filename'] = format_string($file->get_filename(), true, array('context' => $this->get('contextid')));
            }
        }

        if ($this->is_set('userid')) {
            $data['userurl'] = new \moodle_url('/user/view.php', array('id' => $this->get('userid'), 'course' => $this->get('courseid')));
            $data['userfullname'] = format_string($this->get('userfullname'), true, array('context' => $this->get('contextid')));
        }

        if ($docicon = $this->get_doc_icon()) {
            $data['icon'] = $output->image_url($docicon->get_name(), $docicon->get_component());
        }

        return $data;
    }

    /**
     * Formats a text string coming from the search engine.
     *
     * By default just return the text as it is:
     * - Search areas are responsible of sending just plain data, the search engine may
     *   append HTML or markdown to it (highlighting for example).
     * - The view is responsible of shortening the text if it is too big
     *
     * @param  string $text Text to format
     * @return string HTML text to be rendered
     */
    protected function format_text($text) {
        return format_text($text, $this->get_text_format(), array('context' => $this->get('contextid')));
    }
}