<?php
/**
 * MySQL search engine
 *
 * Copyright (C) 2004 Brion Vibber <brion@pobox.com>
 * https://www.mediawiki.org/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup Search
 */

use MediaWiki\MediaWikiServices;

/**
 * Search engine hook for MySQL
 * @ingroup Search
 */
class SearchMySQL extends SearchDatabase {
	/**
	 * @var bool When true, terms given without an explicit boolean modifier
	 *  are prefixed with '+' (i.e. required) by parseQuery().
	 */
	protected $strictMatching = true;

	/**
	 * @var int|null Cached ft_min_word_len value; null until minSearchLength()
	 *  has looked it up. Shared by all instances (static).
	 */
	private static $mMinSearchLength;

	/**
	 * Parse the user's query and transform it into two SQL fragments:
	 * a WHERE condition and an ORDER BY expression
	 *
	 * As a side effect, $this->searchTerms is reset and refilled with one
	 * highlighting regex fragment per parsed term.
	 *
	 * @param string $filteredText
	 * @param string $fulltext
	 *
	 * @return array [ WHERE fragment, ORDER BY fragment ]
	 */
	private function parseQuery( $filteredText, $fulltext ) {
		$lc = $this->legalSearchChars( self::CHARS_NO_SYNTAX ); // Minus syntax chars (" and *)
		$searchon = '';
		$this->searchTerms = [];

		# @todo FIXME: This doesn't handle parenthetical expressions.
		$m = [];
		if ( preg_match_all( '/([-+<>~]?)(([' . $lc . ']+)(\*?)|"[^"]*")/',
				$filteredText, $m, PREG_SET_ORDER ) ) {
			// The content language does not change between terms, so look it up once.
			$contLang = MediaWikiServices::getInstance()->getContentLanguage();

			foreach ( $m as $bits ) {
				// A quoted phrase leaves the trailing capture groups (non-quoted
				// term and wildcard) out of the match array; pad it to five
				// entries so the destructuring never touches an undefined offset.
				list( /* all */, $modifier, $term, $nonQuoted, $wildcard ) = array_pad( $bits, 5, '' );

				if ( $nonQuoted !== '' ) {
					$term = $nonQuoted;
					$quote = '';
				} else {
					$term = str_replace( '"', '', $term );
					$quote = '"';
				}

				if ( $searchon !== '' ) {
					$searchon .= ' ';
				}
				if ( $this->strictMatching && $modifier === '' ) {
					// If we leave this out, boolean op defaults to OR which is rarely helpful.
					$modifier = '+';
				}

				// Some languages such as Serbian store the input form in the search index,
				// so we may need to search for matches in multiple writing system variants.
				$convertedVariants = $contLang->autoConvertToAllVariants( $term );
				if ( is_array( $convertedVariants ) ) {
					$variants = array_unique( array_values( $convertedVariants ) );
				} else {
					$variants = [ $term ];
				}

				// The low-level search index does some processing on input to work
				// around problems with minimum lengths and encoding in MySQL's
				// fulltext engine.
				// For Chinese this also inserts spaces between adjacent Han characters.
				$strippedVariants = array_map( [ $contLang, 'normalizeForSearch' ], $variants );

				// Some languages such as Chinese force all variants to a canonical
				// form when stripping to the low-level search index, so to be sure
				// let's check our variants list for unique items after stripping.
				$strippedVariants = array_unique( $strippedVariants );

				// Several distinct variants are grouped in parentheses behind the modifier.
				$hasMultipleVariants = count( $strippedVariants ) > 1;

				$searchon .= $modifier;
				if ( $hasMultipleVariants ) {
					$searchon .= '(';
				}
				foreach ( $strippedVariants as $stripped ) {
					$stripped = $this->normalizeText( $stripped );
					if ( $nonQuoted && strpos( $stripped, ' ' ) !== false ) {
						// Hack for Chinese: we need to toss in quotes for
						// multiple-character phrases since normalizeForSearch()
						// added spaces between them to make word breaks.
						$stripped = '"' . trim( $stripped ) . '"';
					}
					$searchon .= "$quote$stripped$quote$wildcard ";
				}
				if ( $hasMultipleVariants ) {
					$searchon .= ')';
				}

				// Match individual terms or quoted phrase in result highlighting...
				// Note that variants will be introduced in a later stage for highlighting!
				$regexp = $this->regexTerm( $term, $wildcard );
				$this->searchTerms[] = $regexp;
			}
			wfDebug( __METHOD__ . ": Would search with '$searchon'" );
			wfDebug( __METHOD__ . ': Match with /' . implode( '|', $this->searchTerms ) . "/" );
		} else {
			wfDebug( __METHOD__ . ": Can't understand search query '{$filteredText}'" );
		}

		$dbr = $this->lb->getConnectionRef( DB_REPLICA );
		$searchon = $dbr->addQuotes( $searchon );
		$field = $this->getIndexField( $fulltext );
		return [
			" MATCH($field) AGAINST($searchon IN BOOLEAN MODE) ",
			" MATCH($field) AGAINST($searchon IN NATURAL LANGUAGE MODE) DESC "
		];
	}

	/**
	 * Build the regex fragment used to highlight one search term.
	 *
	 * @param string $string Literal term; it is escaped with preg_quote() for a '/' delimiter
	 * @param string $wildcard Truthy (e.g. '*') when the term ended in a wildcard
	 * @return string
	 */
	private function regexTerm( $string, $wildcard ) {
		$regex = preg_quote( $string, '/' );

		if ( !MediaWikiServices::getInstance()->getContentLanguage()->hasWordBreaks() ) {
			// For Chinese, words may legitimately abut other words in the text literal.
			// Don't add \b boundary checks... note this could cause false positives
			// for Latin chars.
			return $regex;
		}

		if ( $wildcard ) {
			// Don't cut off the final bit!
			return "\b$regex";
		}

		return "\b$regex\b";
	}

	/**
	 * Get the characters that are legal in a search term, adding this
	 * engine's syntax characters when all characters are requested.
	 *
	 * @param int $type One of the CHARS_* constants; defaults to self::CHARS_ALL
	 * @return string
	 */
	public function legalSearchChars( $type = self::CHARS_ALL ) {
		$searchChars = parent::legalSearchChars( $type );
		if ( $type === self::CHARS_ALL ) {
			// " for phrase, * for wildcard
			$searchChars = "\"*" . $searchChars;
		}
		return $searchChars;
	}

	/**
	 * Perform a full text search query and return a result set.
	 *
	 * @param string $term Raw search term
	 * @return SqlSearchResultSet|null
	 */
	protected function doSearchTextInDB( $term ) {
		return $this->searchInternal( $term, true );
	}

	/**
	 * Perform a title-only search query and return a result set.
	 *
	 * @param string $term Raw search term
	 * @return SqlSearchResultSet|null
	 */
	protected function doSearchTitleInDB( $term ) {
		return $this->searchInternal( $term, false );
	}

	/**
	 * Run the search query and a matching COUNT(*) query against a replica.
	 *
	 * @param string $term Raw search term
	 * @param bool $fulltext True to search page text, false to search titles
	 * @return SqlSearchResultSet|null Null when the term is empty after trimming
	 */
	protected function searchInternal( $term, $fulltext ) {
		// This seems out of place, why is this called with empty term?
		if ( trim( $term ) === '' ) {
			return null;
		}

		$filteredTerm = $this->filter( $term );
		$dbr = $this->lb->getConnectionRef( DB_REPLICA );

		$query = $this->getQuery( $filteredTerm, $fulltext );
		$resultSet = $dbr->select(
			$query['tables'], $query['fields'], $query['conds'],
			__METHOD__, $query['options'], $query['joins']
		);

		// Second round trip: total number of matches, ignoring LIMIT/OFFSET.
		$countQuery = $this->getCountQuery( $filteredTerm, $fulltext );
		$totalResult = $dbr->select(
			$countQuery['tables'], $countQuery['fields'], $countQuery['conds'],
			__METHOD__, $countQuery['options'], $countQuery['joins']
		);

		$total = null;
		$row = $totalResult->fetchObject();
		if ( $row ) {
			$total = intval( $row->c );
		}
		$totalResult->free();

		return new SqlSearchResultSet( $resultSet, $this->searchTerms, $total );
	}

	/**
	 * Report whether this engine supports a named feature.
	 *
	 * @param string $feature
	 * @return bool True for 'title-suffix-filter'; anything else is deferred
	 *  to the parent implementation
	 */
	public function supports( $feature ) {
		switch ( $feature ) {
			case 'title-suffix-filter':
				return true;
			default:
				return parent::supports( $feature );
		}
	}

	/**
	 * Add special conditions
	 * @param array &$query
	 * @since 1.18
	 */
	protected function queryFeatures( &$query ) {
		foreach ( $this->features as $feature => $value ) {
			if ( $feature === 'title-suffix-filter' && $value ) {
				$dbr = $this->lb->getConnectionRef( DB_REPLICA );
				// page_title LIKE '<anything>' . $value
				$query['conds'][] = 'page_title' . $dbr->buildLike( $dbr->anyString(), $value );
			}
		}
	}

	/**
	 * Add namespace conditions
	 *
	 * An empty namespace list is replaced (on the object itself) by a list
	 * containing only namespace '0'.
	 *
	 * @param array &$query
	 * @since 1.18 (changed)
	 */
	private function queryNamespaces( &$query ) {
		if ( is_array( $this->namespaces ) ) {
			if ( count( $this->namespaces ) === 0 ) {
				$this->namespaces[] = '0';
			}
			$query['conds']['page_namespace'] = $this->namespaces;
		}
	}

	/**
	 * Add limit options
	 * @param array &$query
	 * @since 1.18
	 */
	protected function limitResult( &$query ) {
		$query['options']['LIMIT'] = $this->limit;
		$query['options']['OFFSET'] = $this->offset;
	}

	/**
	 * Construct the SQL query to do the search.
	 * The guts should be constructed in queryMain()
	 * @param string $filteredTerm
	 * @param bool $fulltext
	 * @return array
	 * @since 1.18 (changed)
	 */
	private function getQuery( $filteredTerm, $fulltext ) {
		$query = [
			'tables' => [],
			'fields' => [],
			'conds' => [],
			'options' => [],
			'joins' => [],
		];

		$this->queryMain( $query, $filteredTerm, $fulltext );
		$this->queryFeatures( $query );
		$this->queryNamespaces( $query );
		$this->limitResult( $query );

		return $query;
	}

	/**
	 * Picks which field to index on, depending on what type of query.
	 * @param bool $fulltext
	 * @return string
	 */
	private function getIndexField( $fulltext ) {
		return $fulltext ? 'si_text' : 'si_title';
	}

	/**
	 * Get the base part of the search query.
	 *
	 * @param array &$query Search query array
	 * @param string $filteredTerm
	 * @param bool $fulltext
	 * @since 1.18 (changed)
	 */
	private function queryMain( &$query, $filteredTerm, $fulltext ) {
		$match = $this->parseQuery( $filteredTerm, $fulltext );
		$query['tables'][] = 'page';
		$query['tables'][] = 'searchindex';
		$query['fields'][] = 'page_id';
		$query['fields'][] = 'page_namespace';
		$query['fields'][] = 'page_title';
		$query['conds'][] = 'page_id=si_page';
		$query['conds'][] = $match[0];
		$query['options']['ORDER BY'] = $match[1];
	}

	/**
	 * Construct the SQL query that counts all matches for the search.
	 *
	 * @since 1.18 (changed)
	 * @param string $filteredTerm
	 * @param bool $fulltext
	 * @return array
	 */
	private function getCountQuery( $filteredTerm, $fulltext ) {
		$match = $this->parseQuery( $filteredTerm, $fulltext );

		$query = [
			'tables' => [ 'page', 'searchindex' ],
			'fields' => [ 'COUNT(*) as c' ],
			'conds' => [ 'page_id=si_page', $match[0] ],
			'options' => [],
			'joins' => [],
		];

		$this->queryFeatures( $query );
		$this->queryNamespaces( $query );

		return $query;
	}

	/**
	 * Create or update the search index record for the given page.
	 * Title and text should be pre-processed.
	 *
	 * @param int $id
	 * @param string $title
	 * @param string $text
	 */
	public function update( $id, $title, $text ) {
		$dbw = $this->lb->getConnectionRef( DB_MASTER );
		$dbw->replace(
			'searchindex',
			'si_page',
			[
				'si_page' => $id,
				'si_title' => $this->normalizeText( $title ),
				'si_text' => $this->normalizeText( $text )
			],
			__METHOD__
		);
	}

	/**
	 * Update a search index record's title only.
	 * Title should be pre-processed.
	 *
	 * @param int $id
	 * @param string $title
	 */
	public function updateTitle( $id, $title ) {
		$dbw = $this->lb->getConnectionRef( DB_MASTER );
		$dbw->update( 'searchindex',
			[ 'si_title' => $this->normalizeText( $title ) ],
			[ 'si_page' => $id ],
			__METHOD__
		);
	}

	/**
	 * Delete an indexed page
	 * Title should be pre-processed.
	 *
	 * @param int $id Page id that was deleted
	 * @param string $title Title of page that was deleted (not used by this engine)
	 */
	public function delete( $id, $title ) {
		$dbw = $this->lb->getConnectionRef( DB_MASTER );
		$dbw->delete( 'searchindex', [ 'si_page' => $id ], __METHOD__ );
	}

	/**
	 * Converts some characters for MySQL's indexing to grok it correctly,
	 * and pads short words to overcome limitations.
	 * @param string $string
	 * @return mixed|string
	 */
	public function normalizeText( $string ) {
		$out = parent::normalizeText( $string );

		// MySQL fulltext index doesn't grok utf-8, so we
		// need to fold cases and convert to hex
		$out = preg_replace_callback(
			"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
			[ $this, 'stripForSearchCallback' ],
			MediaWikiServices::getInstance()->getContentLanguage()->lc( $out ) );

		// And to add insult to injury, the default indexing
		// ignores short words... Pad them so we can pass them
		// through without reconfiguring the server...
		$minLength = $this->minSearchLength();
		if ( $minLength > 1 ) {
			$n = $minLength - 1;
			$out = preg_replace(
				"/\b(\w{1,$n})\b/",
				"$1u800",
				$out );
		}

		// Periods within things like hostnames and IP addresses
		// are also important -- we want a search for "example.com"
		// or "192.168.1.1" to work sanely.
		// MySQL's search seems to ignore them, so you'd match on
		// "example.wikipedia.com" and "192.168.83.1" as well.
		$out = preg_replace(
			"/(\w)\.(\w|\*)/u",
			"$1u82e$2",
			$out );

		return $out;
	}

	/**
	 * Armor a case-folded UTF-8 string to get through MySQL's
	 * fulltext search without being mucked up by funny charset
	 * settings or anything else of the sort.
	 * @param array $matches
	 * @return string
	 */
	protected function stripForSearchCallback( $matches ) {
		return 'u8' . bin2hex( $matches[1] );
	}

	/**
	 * Check MySQL server's ft_min_word_len setting so we know
	 * if we need to pad short words...
	 *
	 * The value is looked up once and then cached in a static property;
	 * 0 is cached when the variable cannot be read.
	 *
	 * @return int
	 */
	protected function minSearchLength() {
		if ( self::$mMinSearchLength === null ) {
			$sql = "SHOW GLOBAL VARIABLES LIKE 'ft\\_min\\_word\\_len'";

			$dbr = $this->lb->getConnectionRef( DB_REPLICA );
			$result = $dbr->query( $sql, __METHOD__ );
			$row = $result->fetchObject();
			$result->free();

			if ( $row && $row->Variable_name === 'ft_min_word_len' ) {
				self::$mMinSearchLength = intval( $row->Value );
			} else {
				self::$mMinSearchLength = 0;
			}
		}
		return self::$mMinSearchLength;
	}
}