<?php
/**
 * @package     Joomla.Administrator
 * @subpackage  com_finder
 *
 * @copyright   Copyright (C) 2005 - 2020 Open Source Matters, Inc. All rights reserved.
 * @license     GNU General Public License version 2 or later; see LICENSE.txt
 */

defined('_JEXEC') or die;

use Joomla\String\StringHelper;

JLoader::register('FinderIndexerHelper', __DIR__ . '/helper.php');
JLoader::register('FinderIndexerParser', __DIR__ . '/parser.php');
JLoader::register('FinderIndexerStemmer', __DIR__ . '/stemmer.php');
JLoader::register('FinderIndexerTaxonomy', __DIR__ . '/taxonomy.php');
JLoader::register('FinderIndexerToken', __DIR__ . '/token.php');

jimport('joomla.filesystem.file');

/**
 * Main indexer class for the Finder indexer package.
 *
 * The indexer class provides the core functionality of the Finder
 * search engine. It is responsible for adding and updating the
 * content links table; extracting and scoring tokens; and maintaining
 * all referential information for the content.
 *
 * Note: All exceptions thrown from within this class should be caught
 * by the controller.
 *
 * @since  2.5
 */
abstract class FinderIndexer
{
	/**
	 * The title context identifier.
	 *
	 * @var    integer
	 * @since  2.5
	 */
	const TITLE_CONTEXT = 1;

	/**
	 * The text context identifier.
	 *
	 * @var    integer
	 * @since  2.5
	 */
	const TEXT_CONTEXT = 2;

	/**
	 * The meta context identifier.
	 *
	 * @var    integer
	 * @since  2.5
	 */
	const META_CONTEXT = 3;

	/**
	 * The path context identifier.
	 *
	 * @var    integer
	 * @since  2.5
	 */
	const PATH_CONTEXT = 4;

	/**
	 * The misc context identifier.
	 *
	 * @var    integer
	 * @since  2.5
	 */
	const MISC_CONTEXT = 5;

	/**
	 * The indexer state object.
	 *
	 * @var    JObject
	 * @since  2.5
	 */
	public static $state;

	/**
	 * The indexer profiler object.
	 *
	 * @var    JProfiler
	 * @since  2.5
	 */
	public static $profiler;

	/**
	 * Database driver cache.
	 *
	 * @var    JDatabaseDriver
	 * @since  3.8.0
	 */
	protected $db;

	/**
	 * Reusable Query Template. To be used with clone.
	 *
	 * @var    JDatabaseQuery
	 * @since  3.8.0
	 */
	protected $addTokensToDbQueryTemplate;

	/**
	 * FinderIndexer constructor.
	 *
	 * Caches the database driver and prepares the INSERT template that
	 * addTokensToDb() clones for every batch of tokens.
	 *
	 * @since   3.8.0
	 */
	public function __construct()
	{
		$this->db = JFactory::getDbo();

		$db = $this->db;

		/**
		 * Set up query template for addTokensToDb, we will be cloning this template when needed.
		 * This is about twice as fast as calling the clear function or setting up a new object.
		 */
		$this->addTokensToDbQueryTemplate = $db->getQuery(true)->insert($db->quoteName('#__finder_tokens'))
			->columns(
				array(
					$db->quoteName('term'),
					$db->quoteName('stem'),
					$db->quoteName('common'),
					$db->quoteName('phrase'),
					$db->quoteName('weight'),
					$db->quoteName('context'),
					$db->quoteName('language')
				)
			);
	}

	/**
	 * Returns a FinderIndexer driver instance matching the current database server type.
	 *
	 * The driver class is looked up in the "driver" directory next to this file, in a
	 * file named after the server type, and is expected to be called
	 * FinderIndexerDriver<Servertype>.
	 *
	 * @return  FinderIndexer instance based on the database driver
	 *
	 * @since   3.0
	 * @throws  RuntimeException if driver class for indexer not present.
	 */
	public static function getInstance()
	{
		// Setup the adapter for the indexer.
		$serverType = JFactory::getDbo()->getServerType();

		// For `mssql` server types, convert the type to `sqlsrv`
		if ($serverType === 'mssql')
		{
			$serverType = 'sqlsrv';
		}

		$path  = __DIR__ . '/driver/' . $serverType . '.php';
		$class = 'FinderIndexerDriver' . ucfirst($serverType);

		// Check if a driver file exists for the server type.
		if (file_exists($path))
		{
			// Register the driver class with the autoloader and instantiate it.
			JLoader::register($class, $path);

			return new $class;
		}

		// No driver is available for this server type.
		throw new RuntimeException(JText::sprintf('COM_FINDER_INDEXER_INVALID_DRIVER', $serverType));
	}

	/**
	 * Method to get the indexer state.
	 *
	 * The state is taken from the static cache when present, otherwise from the
	 * session, otherwise it is built from the com_finder component parameters.
	 * When the state is not served from the static cache, this method also sets up
	 * the profiler (if debugging is enabled) and the stemmer.
	 *
	 * @return  object  The indexer state object.
	 *
	 * @since   2.5
	 */
	public static function getState()
	{
		// First, try to load from the internal state.
		if ((bool) static::$state)
		{
			return static::$state;
		}

		// If we couldn't load from the internal state, try the session.
		$session = JFactory::getSession();
		$data    = $session->get('_finder.state', null);

		// If the state is empty, load the values for the first time.
		if (empty($data))
		{
			$data = new JObject;

			// Load the default configuration options.
			$data->options = JComponentHelper::getParams('com_finder');

			// Setup the weight lookup information.
			$data->weights = array(
				self::TITLE_CONTEXT => round($data->options->get('title_multiplier', 1.7), 2),
				self::TEXT_CONTEXT  => round($data->options->get('text_multiplier', 0.7), 2),
				self::META_CONTEXT  => round($data->options->get('meta_multiplier', 1.2), 2),
				self::PATH_CONTEXT  => round($data->options->get('path_multiplier', 2.0), 2),
				self::MISC_CONTEXT  => round($data->options->get('misc_multiplier', 0.3), 2)
			);

			// Set the current time as the start time.
			$data->startTime = JFactory::getDate()->toSql();

			// Set the remaining default values.
			$data->batchSize   = (int) $data->options->get('batch_size', 50);
			$data->batchOffset = 0;
			$data->totalItems  = 0;
			$data->pluginState = array();
		}

		// Setup the profiler if debugging is enabled.
		if (JFactory::getApplication()->get('debug'))
		{
			static::$profiler = JProfiler::getInstance('FinderIndexer');
		}

		// Setup the stemmer.
		if ($data->options->get('stem', 1) && $data->options->get('stemmer', 'porter_en'))
		{
			FinderIndexerHelper::$stemmer = FinderIndexerStemmer::getInstance($data->options->get('stemmer', 'porter_en'));
		}

		// Set the state.
		static::$state = $data;

		return static::$state;
	}

	/**
	 * Method to set the indexer state.
	 *
	 * The state is stored both in the static cache and in the session under
	 * the "_finder.state" key.
	 *
	 * @param   object  $data  A new indexer state object.
	 *
	 * @return  boolean  True on success, false on failure.
	 *
	 * @since   2.5
	 */
	public static function setState($data)
	{
		// Check the state object: only non-empty JObject instances are accepted.
		if (empty($data) || !$data instanceof JObject)
		{
			return false;
		}

		// Set the new internal state.
		static::$state = $data;

		// Set the new session state.
		JFactory::getSession()->set('_finder.state', $data);

		return true;
	}

	/**
	 * Method to reset the indexer state.
	 *
	 * Clears both the static cache and the session copy of the state.
	 *
	 * @return  void
	 *
	 * @since   2.5
	 */
	public static function resetState()
	{
		// Reset the internal state to null (static:: for consistency with getState()/setState()).
		static::$state = null;

		// Reset the session state to null.
		JFactory::getSession()->set('_finder.state', null);
	}

	/**
	 * Method to index a content item.
	 *
	 * @param   FinderIndexerResult  $item    The content item to index.
	 * @param   string               $format  The format of the content. [optional]
	 *
	 * @return  integer  The ID of the record in the links table.
	 *
	 * @since   2.5
	 * @throws  Exception on database error.
	 */
	abstract public function index($item, $format = 'html');

	/**
	 * Method to remove a link from the index.
	 *
	 * Decrements the link counters of all terms mapped to the link, deletes the
	 * mapping rows from the sixteen #__finder_links_terms0..f tables, deletes terms
	 * whose counter dropped to zero or below, deletes the link itself and finally
	 * cleans up the taxonomy maps and orphaned taxonomy nodes.
	 *
	 * @param   integer  $linkId  The id of the link.
	 *
	 * @return  boolean  True on success.
	 *
	 * @since   2.5
	 * @throws  Exception on database error.
	 */
	public function remove($linkId)
	{
		$db    = $this->db;
		$query = $db->getQuery(true);

		// Update the link counts and remove the mapping records (one mapping table per hex digit 0-f).
		for ($i = 0; $i <= 15; $i++)
		{
			// Update the link counts for the terms.
			$query->clear()
				->update($db->quoteName('#__finder_terms', 't'))
				->join('INNER', $db->quoteName('#__finder_links_terms' . dechex($i), 'm') .
					' ON ' . $db->quoteName('m.term_id') . ' = ' . $db->quoteName('t.term_id')
				)
				->set($db->quoteName('links') . ' = ' . $db->quoteName('links') . ' - 1')
				->where($db->quoteName('m.link_id') . ' = ' . (int) $linkId);
			$db->setQuery($query)->execute();

			// Remove all records from the mapping tables.
			$query->clear()
				->delete($db->quoteName('#__finder_links_terms' . dechex($i)))
				->where($db->quoteName('link_id') . ' = ' . (int) $linkId);
			$db->setQuery($query)->execute();
		}

		// Delete all orphaned terms.
		$query->clear()
			->delete($db->quoteName('#__finder_terms'))
			->where($db->quoteName('links') . ' <= 0');
		$db->setQuery($query)->execute();

		// Delete the link from the index.
		$query->clear()
			->delete($db->quoteName('#__finder_links'))
			->where($db->quoteName('link_id') . ' = ' . (int) $linkId);
		$db->setQuery($query)->execute();

		// Remove the taxonomy maps.
		FinderIndexerTaxonomy::removeMaps($linkId);

		// Remove the orphaned taxonomy nodes.
		FinderIndexerTaxonomy::removeOrphanNodes();

		return true;
	}

	/**
	 * Method to optimize the index.  We use this method to remove unused terms
	 * and any other optimizations that might be necessary.
	 *
	 * @return  boolean  True on success.
	 *
	 * @since   2.5
	 * @throws  Exception on database error.
	 */
	abstract public function optimize();

	/**
	 * Method to get a content item's signature.
	 *
	 * The signature is the MD5 hash of the serialized item together with the
	 * context weights and stemming options, so it changes when either the item
	 * or those settings change.
	 *
	 * @param   object  $item  The content item to index.
	 *
	 * @return  string  The content item's signature.
	 *
	 * @since   2.5
	 */
	protected static function getSignature($item)
	{
		// Get the indexer state.
		$state = static::getState();

		// Get the relevant configuration variables.
		$config = array(
			$state->weights,
			$state->options->get('stem', 1),
			$state->options->get('stemmer', 'porter_en')
		);

		return md5(serialize(array($item, $config)));
	}

	/**
	 * Method to parse input, tokenize it, and then add it to the database.
	 *
	 * @param   mixed    $input    String or resource to use as input. A resource input will automatically be chunked to conserve
	 *                             memory. Strings will be chunked if longer than 2K in size.
	 * @param   integer  $context  The context of the input. See context constants.
	 * @param   string   $lang     The language of the input.
	 * @param   string   $format   The format of the input.
	 *
	 * @return  integer  The number of tokens extracted from the input.
	 *
	 * @since   2.5
	 */
	protected function tokenizeToDb($input, $context, $lang, $format)
	{
		$count  = 0;
		$buffer = '';

		if (empty($input))
		{
			return $count;
		}

		// Plain (non-resource) input is handed over in one go.
		if (!is_resource($input))
		{
			return $this->tokenizeToDbShort($input, $context, $lang, $format, $count);
		}

		// Batch the process out to avoid memory limits.
		while (!feof($input))
		{
			// Read into the buffer.
			$buffer .= fread($input, 2048);

			/*
			 * If we haven't reached the end of the file, seek to the last
			 * space character and drop whatever is after that to make sure
			 * we didn't truncate a term while reading the input.
			 */
			$ls = feof($input) ? false : strrpos($buffer, ' ');

			if ($ls)
			{
				// Process everything up to the last space character.
				$string = substr($buffer, 0, $ls);

				// Carry the (trimmed) remainder over to the next iteration.
				$buffer = StringHelper::trim(substr($buffer, $ls));
			}
			else
			{
				/*
				 * Either the end of the file was reached or no usable space was found:
				 * process whatever is buffered. The buffer has to be emptied afterwards,
				 * otherwise the same text would be prepended to the next chunk and be
				 * tokenised a second time.
				 */
				$string = $buffer;
				$buffer = '';
			}

			// Parse, tokenise and add tokens to the database.
			$count = $this->tokenizeToDbShort($string, $context, $lang, $format, $count);

			unset($string);
		}

		return $count;
	}

	/**
	 * Method to parse input, tokenise it, then add the tokens to the database.
	 *
	 * @param   string   $input    String to parse, tokenise and add to database.
	 * @param   integer  $context  The context of the input. See context constants.
	 * @param   string   $lang     The language of the input.
	 * @param   string   $format   The format of the input.
	 * @param   integer  $count    The number of tokens processed so far.
	 *
	 * @return  integer  Cumulative number of tokens extracted from the input so far.
	 *
	 * @since   3.7.0
	 */
	private function tokenizeToDbShort($input, $context, $lang, $format, $count)
	{
		// Parse the input.
		$input = FinderIndexerHelper::parse($input, $format);

		// Check the input.
		if (empty($input))
		{
			return $count;
		}

		// Tokenize the input.
		$tokens = FinderIndexerHelper::tokenize($input, $lang);

		// Add the tokens to the database.
		$count += $this->addTokensToDb($tokens, $context);

		/*
		 * Check if we're approaching the memory limit of the token table.
		 * getState() returns the cached state when it is already loaded and loads it
		 * otherwise, so this no longer fails when the state has not been initialised yet.
		 */
		if ($count > static::getState()->options->get('memory_table_limit', 30000))
		{
			$this->toggleTables(false);
		}

		return $count;
	}

	/**
	 * Method to add a set of tokens to the database.
	 *
	 * Tokens are inserted into #__finder_tokens with multi-row INSERT statements
	 * of at most 1000 rows each.
	 *
	 * @param   mixed  $tokens   An array or single FinderIndexerToken object.
	 * @param   mixed  $context  The context of the tokens. See context constants. [optional]
	 *
	 * @return  integer  The number of tokens inserted into the database.
	 *
	 * @since   2.5
	 * @throws  Exception on database error.
	 */
	protected function addTokensToDb($tokens, $context = '')
	{
		// Treat a single token as a one-element list so both cases share one code path.
		if ($tokens instanceof FinderIndexerToken)
		{
			$tokens = array($tokens);
		}

		// Nothing to insert: executing the template without value sets would be an invalid INSERT.
		if (empty($tokens))
		{
			return 0;
		}

		// Get the database object.
		$db = $this->db;

		// Count the number of token values.
		$values = 0;

		// Break into chunks of no more than 1000 items
		$chunks = count($tokens) > 1000
			? array_chunk($tokens, 1000)
			: array($tokens);

		foreach ($chunks as $chunkTokens)
		{
			// Cloning a new query template is twice as fast as calling the clear function
			$query = clone $this->addTokensToDbQueryTemplate;

			// Iterate through the tokens to create SQL value sets.
			foreach ($chunkTokens as $token)
			{
				$query->values(
					$db->quote($token->term) . ', '
					. $db->quote($token->stem) . ', '
					. (int) $token->common . ', '
					. (int) $token->phrase . ', '
					. $db->escape((float) $token->weight) . ', '
					. (int) $context . ', '
					. $db->quote($token->language)
				);
				++$values;
			}

			$db->setQuery($query)->execute();
		}

		return $values;
	}

	/**
	 * Method to switch the token tables from Memory tables to Disk tables
	 * when they are close to running out of memory.
	 * Since this is not supported/implemented in all DB-drivers, the default is a stub method, which simply returns true.
	 *
	 * @param   boolean  $memory  Flag to control how they should be toggled.
	 *
	 * @return  boolean  True on success.
	 *
	 * @since   2.5
	 * @throws  Exception on database error.
	 */
	protected function toggleTables($memory)
	{
		return true;
	}
}