<?php
/**
 * @package     Joomla.Administrator
 * @subpackage  com_finder
 *
 * @copyright   Copyright (C) 2005 - 2020 Open Source Matters, Inc. All rights reserved.
 * @license     GNU General Public License version 2 or later; see LICENSE.txt
 */

defined('_JEXEC') or die;

use Joomla\Registry\Registry;
use Joomla\String\StringHelper;

JLoader::register('FinderIndexerParser', __DIR__ . '/parser.php');
JLoader::register('FinderIndexerStemmer', __DIR__ . '/stemmer.php');
JLoader::register('FinderIndexerToken', __DIR__ . '/token.php');

/**
 * Helper class for the Finder indexer package.
 *
 * All methods are static. Several of them memoise their results in
 * function-level static variables for the lifetime of the request.
 *
 * @since  2.5
 */
class FinderIndexerHelper
{
	/**
	 * The token stemmer object. The stemmer is set by whatever class
	 * wishes to use it but it must be an instance of FinderIndexerStemmer.
	 *
	 * @var    FinderIndexerStemmer
	 * @since  2.5
	 */
	public static $stemmer;

	/**
	 * A state flag, in order to not constantly check if the stemmer is an instance of FinderIndexerStemmer
	 *
	 * @var    boolean
	 * @since  3.7.0
	 */
	protected static $stemmerOK;

	/**
	 * Method to parse input into plain text.
	 *
	 * @param   string  $input   The raw input.
	 * @param   string  $format  The format of the input. [optional]
	 *
	 * @return  string  The parsed input.
	 *
	 * @since   2.5
	 * @throws  Exception on invalid parser.
	 */
	public static function parse($input, $format = 'html')
	{
		// Get a parser for the specified format and parse the input.
		return FinderIndexerParser::getInstance($format)->parse($input);
	}

	/**
	 * Method to tokenize a text string.
	 *
	 * The input is lower-cased, stripped of punctuation by a fixed sequence of
	 * regular expressions and split on spaces. Unless the input is handled as a
	 * phrase, derived two and three word tokens are appended after the single
	 * word tokens. Results for inputs shorter than 128 characters are kept in a
	 * static cache keyed on the input, the language and the phrase flag.
	 *
	 * @param   string   $input   The input to tokenize.
	 * @param   string   $lang    The language of the input.
	 * @param   boolean  $phrase  Flag to indicate whether input could be a phrase. [optional]
	 *
	 * @return  array|FinderIndexerToken  An array of FinderIndexerToken objects or a single FinderIndexerToken object.
	 *
	 * @since   2.5
	 */
	public static function tokenize($input, $lang, $phrase = false)
	{
		static $cache;

		// Only short inputs are cached; longer ones get no cache key at all.
		$store = StringHelper::strlen($input) < 128 ? md5($input . '::' . $lang . '::' . $phrase) : null;

		// Check if the string has been tokenized already.
		if ($store && isset($cache[$store]))
		{
			return $cache[$store];
		}

		$tokens = array();

		/*
		 * U+2018 (left single quotation mark), U+2019 (right single quotation mark)
		 * and the ASCII apostrophe, spelled as UTF-8 byte escapes so the literal
		 * cannot be damaged by HTML entity decoding of this file.
		 */
		$quotes = "\xE2\x80\x98\xE2\x80\x99'";

		// Get the simple language key.
		$lang = static::getPrimaryLanguage($lang);

		/*
		 * Parsing the string input into terms is a multi-step process.
		 *
		 * Regexes:
		 *  1. Remove everything except letters, numbers, quotes, apostrophe, plus, dash, period, and comma.
		 *  2. Remove plus, dash, period, and comma characters located before letter characters.
		 *  3. Remove plus, dash, period, and comma characters located after other characters.
		 *  4. Remove plus, period, and comma characters enclosed in alphabetical characters. Ungreedy.
		 *  5. Remove orphaned apostrophe, plus, dash, period, and comma characters.
		 *  6. Remove orphaned quote characters.
		 *  7. Replace the assorted single quotation marks with the ASCII standard single quotation.
		 *  8. Remove multiple space characters and replaces with a single space.
		 *
		 * Note: inside the character classes "+-." is a range from "+" to ".",
		 * which covers exactly the four characters "+", ",", "-" and ".".
		 */
		$input = StringHelper::strtolower($input);
		$input = preg_replace('#[^\pL\pM\pN\p{Pi}\p{Pf}\'+-.,]+#mui', ' ', $input);

		// Step 2 must re-emit the captured word ($2); emitting $1 would discard the word itself.
		$input = preg_replace('#(^|\s)[+-.,]+([\pL\pM]+)#mui', ' $2', $input);
		$input = preg_replace('#([\pL\pM\pN]+)[+-.,]+(\s|$)#mui', '$1 ', $input);
		$input = preg_replace('#([\pL\pM]+)[+.,]+([\pL\pM]+)#muiU', '$1 $2', $input);
		$input = preg_replace('#(^|\s)[\'+-.,]+(\s|$)#mui', ' ', $input);
		$input = preg_replace('#(^|\s)[\p{Pi}\p{Pf}]+(\s|$)#mui', ' ', $input);
		$input = preg_replace('#[' . $quotes . ']+#mui', '\'', $input);
		$input = preg_replace('#\s+#mui', ' ', $input);
		$input = trim($input);

		// Explode the normalized string to get the terms.
		$terms = explode(' ', $input);

		/*
		 * If we have Unicode support and are dealing with Chinese text, Chinese
		 * has to be handled specially because there are not necessarily any spaces
		 * between the "words". So, we have to test if the words belong to the Chinese
		 * character set and if so, explode them into single glyphs or "words".
		 */
		if ($lang === 'zh')
		{
			// Iterate through the original terms only; glyphs appended below are not revisited.
			for ($i = 0, $n = count($terms); $i < $n; $i++)
			{
				$charMatches = array();
				$charCount   = preg_match_all('#[\p{Han}]#mui', $terms[$i], $charMatches);

				// Split apart any groups of Chinese characters.
				for ($j = 0; $j < $charCount; $j++)
				{
					$glyph = $charMatches[0][$j];

					// A repeated glyph may already have emptied and unset this term.
					$remaining = isset($terms[$i]) ? $terms[$i] : '';
					$tSplit    = StringHelper::str_ireplace($glyph, '', $remaining, false);

					if ((bool) $tSplit)
					{
						$terms[$i] = $tSplit;
					}
					else
					{
						unset($terms[$i]);
					}

					$terms[] = $glyph;
				}
			}

			// Reset array keys.
			$terms = array_values($terms);
		}

		/*
		 * If we have to handle the input as a phrase, that means we don't
		 * tokenize the individual terms and we do not create the two and three
		 * term combinations. The phrase must contain more than one word!
		 */
		if ($phrase === true && count($terms) > 1)
		{
			// Create tokens from the phrase.
			$tokens[] = new FinderIndexerToken($terms, $lang);
		}
		else
		{
			// Chinese terms are joined without a separator, everything else with a space.
			$spacer = $lang === 'zh' ? '' : ' ';

			// Create tokens from the terms.
			foreach ($terms as $term)
			{
				$tokens[] = new FinderIndexerToken($term, $lang);
			}

			// Create two and three word phrase tokens from the individual words.
			for ($i = 0, $n = count($tokens); $i < $n; $i++)
			{
				// Setup the phrase positions.
				$i2 = $i + 1;
				$i3 = $i + 2;

				// Create the two word phrase.
				if ($i2 < $n && isset($tokens[$i2]))
				{
					$token          = new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term), $lang, $spacer);
					$token->derived = true;

					// Add the token to the stack.
					$tokens[] = $token;
				}

				// Create the three word phrase.
				if ($i3 < $n && isset($tokens[$i3]))
				{
					$token          = new FinderIndexerToken(
						array($tokens[$i]->term, $tokens[$i2]->term, $tokens[$i3]->term),
						$lang,
						$spacer
					);
					$token->derived = true;

					// Add the token to the stack.
					$tokens[] = $token;
				}
			}
		}

		// A lone token is returned as the object itself rather than a one element array.
		$result = count($tokens) > 1 ? $tokens : array_shift($tokens);

		if ($store)
		{
			$cache[$store] = $result;
		}

		return $result;
	}

	/**
	 * Method to get the base word of a token. This method uses the public
	 * {@link FinderIndexerHelper::$stemmer} object if it is set. If no stemmer is set,
	 * the token is returned after apostrophe trimming only.
	 *
	 * @param   string  $token  The token to stem.
	 * @param   string  $lang   The language of the token.
	 *
	 * @return  string  The root token.
	 *
	 * @since   2.5
	 */
	public static function stem($token, $lang)
	{
		// Trim apostrophes at either end of the token.
		$token = trim($token, '\'');

		// Trim everything after any apostrophe in the token.
		$parts = explode('\'', $token);
		$token = $parts[0];

		// Validate the stemmer once and remember the outcome in the state flag.
		if (static::$stemmerOK !== true && static::$stemmer instanceof FinderIndexerStemmer)
		{
			static::$stemmerOK = true;
		}

		if (static::$stemmerOK === true)
		{
			return static::$stemmer->stem($token, $lang);
		}

		return $token;
	}

	/**
	 * Method to add a content type to the database.
	 *
	 * Known types are loaded from #__finder_types once and kept in a static
	 * cache indexed by title. A newly inserted type is added to that cache so
	 * that a repeated call with the same title does not insert a second row.
	 *
	 * @param   string  $title  The type of content. For example: PDF
	 * @param   string  $mime   The mime type of the content. For example: PDF [optional]
	 *
	 * @return  integer  The id of the content type.
	 *
	 * @since   2.5
	 * @throws  Exception on database error.
	 */
	public static function addContentType($title, $mime = null)
	{
		static $types;

		$db    = JFactory::getDbo();
		$query = $db->getQuery(true);

		// Check if the types are loaded.
		if (empty($types))
		{
			// Build the query to get the types.
			$query->select('*')
				->from($db->quoteName('#__finder_types'));

			// Get the types.
			$db->setQuery($query);
			$loaded = $db->loadObjectList('title');
			$types  = is_array($loaded) ? $loaded : array();
		}

		// Check if the type already exists.
		if (isset($types[$title]))
		{
			return (int) $types[$title]->id;
		}

		// Add the type.
		$query->clear()
			->insert($db->quoteName('#__finder_types'))
			->columns(array($db->quoteName('title'), $db->quoteName('mime')))
			->values($db->quote($title) . ', ' . $db->quote($mime));
		$db->setQuery($query);
		$db->execute();

		// Remember the new type so later calls in this request find it in the cache.
		$type        = new stdClass;
		$type->id    = (int) $db->insertid();
		$type->title = $title;
		$type->mime  = $mime;

		$types[$title] = $type;

		// Return the new id.
		return $type->id;
	}

	/**
	 * Method to check if a token is common in a language.
	 *
	 * The common terms of each language are loaded once and cached. The
	 * wildcard language "*" is mapped to the part of the site default language
	 * tag before the first dash on every call.
	 *
	 * @param   string  $token  The token to test.
	 * @param   string  $lang   The language to reference.
	 *
	 * @return  boolean  True if common, false otherwise.
	 *
	 * @since   2.5
	 */
	public static function isCommon($token, $lang)
	{
		static $data;
		static $default;

		$langCode = $lang;

		// If language requested is wildcard, use the default language.
		if ($lang === '*')
		{
			// Resolve the default only once, but apply it on every wildcard call.
			if ($default === null)
			{
				$defaultTag = self::getDefaultLanguage();
				$primary    = strstr($defaultTag, '-', true);

				// strstr() yields false when the tag holds no dash; use the whole tag then.
				$default = $primary === false ? $defaultTag : $primary;
			}

			$langCode = $default;
		}

		// Load the common tokens for the language if necessary.
		if (!isset($data[$langCode]))
		{
			$data[$langCode] = self::getCommonWords($langCode);
		}

		// Check if the token is in the common array.
		return in_array($token, $data[$langCode], true);
	}

	/**
	 * Method to get an array of common terms for a language.
	 *
	 * @param   string  $lang  The language to use.
	 *
	 * @return  array  Array of common terms.
	 *
	 * @since   2.5
	 * @throws  Exception on database error.
	 */
	public static function getCommonWords($lang)
	{
		$db = JFactory::getDbo();

		// Create the query to load all the common terms for the language.
		$query = $db->getQuery(true)
			->select($db->quoteName('term'))
			->from($db->quoteName('#__finder_terms_common'))
			->where($db->quoteName('language') . ' = ' . $db->quote($lang));

		// Load all of the common terms for the language.
		$db->setQuery($query);
		$words = $db->loadColumn();

		// Always hand back an array so callers can search it safely.
		return is_array($words) ? $words : array();
	}

	/**
	 * Method to get the default language for the site.
	 *
	 * The value is read from the "site" parameter of com_languages, with
	 * "en-GB" as the fallback, and cached in a static variable.
	 *
	 * @return  string  The default language string.
	 *
	 * @since   2.5
	 */
	public static function getDefaultLanguage()
	{
		static $lang;

		// We need to go to com_languages to get the site default language, it's the best we can guess.
		if (empty($lang))
		{
			$lang = JComponentHelper::getParams('com_languages')->get('site', 'en-GB');
		}

		return $lang;
	}

	/**
	 * Method to parse a language/locale key and return a simple language string.
	 *
	 * Results are cached per key. Locale::getPrimaryLanguage() is used when it
	 * is callable; otherwise the key is cut at its first dash.
	 *
	 * @param   string  $lang  The language/locale key. For example: en-GB
	 *
	 * @return  string  The simple language string. For example: en
	 *
	 * @since   2.5
	 */
	public static function getPrimaryLanguage($lang)
	{
		static $data;

		// Only parse the identifier if necessary.
		if (isset($data[$lang]))
		{
			return $data[$lang];
		}

		if (is_callable(array('Locale', 'getPrimaryLanguage')))
		{
			// Get the language key using the Locale package.
			$data[$lang] = Locale::getPrimaryLanguage($lang);
		}
		else
		{
			// Get the language key using string position.
			$data[$lang] = StringHelper::substr($lang, 0, StringHelper::strpos($lang, '-'));
		}

		return $data[$lang];
	}

	/**
	 * Method to get the path (SEF route) for a content item.
	 *
	 * @param   string  $url  The non-SEF route to the content item.
	 *
	 * @return  string  The path for the content item.
	 *
	 * @since       2.5
	 * @deprecated  4.0
	 */
	public static function getContentPath($url)
	{
		static $router;

		// Only get the router once.
		if (!($router instanceof JRouter))
		{
			// Get and configure the site router.
			$config = JFactory::getConfig();
			$router = JRouter::getInstance('site');
			$router->setMode($config->get('sef', 1));
		}

		// Build the relative route.
		$uri   = $router->build($url);
		$route = $uri->toString(array('path', 'query', 'fragment'));

		// Strip the base path so the returned route is relative.
		return str_replace(JUri::base(true) . '/', '', $route);
	}

	/**
	 * Method to get extra data for a content before being indexed. This is how
	 * we add Comments, Tags, Labels, etc. that should be available to Finder.
	 *
	 * Triggers the onPrepareFinderContent event on the finder plugin group.
	 * A listener reports failure by returning exactly false.
	 *
	 * @param   FinderIndexerResult  $item  The item to index as a FinderIndexerResult object.
	 *
	 * @return  boolean  True on success.
	 *
	 * @since   2.5
	 * @throws  Exception if any listener returned false.
	 */
	public static function getContentExtras(FinderIndexerResult $item)
	{
		// Get the event dispatcher.
		$dispatcher = JEventDispatcher::getInstance();

		// Load the finder plugin group.
		JPluginHelper::importPlugin('finder');

		// Trigger the event.
		$results = $dispatcher->trigger('onPrepareFinderContent', array(&$item));

		/*
		 * Check the returned results. This is for plugins that don't throw
		 * exceptions when they encounter serious errors. The comparison is
		 * strict so that a listener returning null, 0 or '' is not mistaken
		 * for a failure.
		 */
		if (in_array(false, $results, true))
		{
			throw new Exception($dispatcher->getError(), 500);
		}

		return true;
	}

	/**
	 * Method to process content text using the onContentPrepare event trigger.
	 *
	 * A mock content table object carrying the text (and, when given, the data
	 * of the indexed item) is passed to the event with the context
	 * "com_finder.indexer"; the possibly modified text is returned.
	 *
	 * @param   string               $text    The content to process.
	 * @param   Registry             $params  The parameters object. [optional]
	 * @param   FinderIndexerResult  $item    The item which get prepared. [optional]
	 *
	 * @return  string  The processed content.
	 *
	 * @since   2.5
	 */
	public static function prepareContent($text, $params = null, FinderIndexerResult $item = null)
	{
		static $loaded;

		// Get the dispatcher.
		$dispatcher = JEventDispatcher::getInstance();

		// Load the content plugins if necessary.
		if (empty($loaded))
		{
			JPluginHelper::importPlugin('content');
			$loaded = true;
		}

		// Instantiate the parameter object if necessary.
		if (!($params instanceof Registry))
		{
			$params = new Registry($params);
		}

		// Create a mock content object.
		$content       = JTable::getInstance('Content');
		$content->text = $text;

		if ($item)
		{
			$content->bind((array) $item);
			$content->bind($item->getElements());

			if (!empty($item->context))
			{
				$content->context = $item->context;
			}
		}

		// Fire the onContentPrepare event.
		$dispatcher->trigger('onContentPrepare', array('com_finder.indexer', &$content, &$params, 0));

		return $content->text;
	}
}