<?php
/**
 * Search index updater
 *
 * See deferred.txt
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup Search
 */

use MediaWiki\MediaWikiServices;

/**
 * Database independent search index updater
 *
 * @ingroup Search
 */
class SearchUpdate implements DeferrableUpdate {
	/** @var int Page id being updated */
	private $id = 0;

	/** @var Title Title we're updating */
	private $title;

	/** @var Content|null Content of the page (not text) */
	private $content;

	/** @var WikiPage */
	private $page;

	/**
	 * @param int $id Page id to update
	 * @param Title $title Title of page to update
	 * @param Content|null $c Content of the page to update.
	 */
	public function __construct( $id, $title, $c = null ) {
		if ( is_string( $title ) ) {
			wfDeprecated( __METHOD__ . " with a string for the title", '1.34' );
			$this->title = Title::newFromText( $title );
			if ( $this->title === null ) {
				throw new InvalidArgumentException( "Cannot construct the title: $title" );
			}
		} else {
			$this->title = $title;
		}

		$this->id = $id;
		// is_string() check is back-compat for ApprovedRevs
		if ( is_string( $c ) ) {
			wfDeprecated( __METHOD__ . " with a string for the content", '1.34' );
			$c = new TextContent( $c );
		} elseif ( is_bool( $c ) ) {
			wfDeprecated( __METHOD__ . " with a boolean for the content", '1.34' );
			$c = null;
		}
		$this->content = $c;
	}

	/**
	 * Perform actual update for the entry
	 */
	public function doUpdate() {
		$services = MediaWikiServices::getInstance();
		$config = $services->getSearchEngineConfig();

		if ( $config->getConfig()->get( 'DisableSearchUpdate' ) || !$this->id ) {
			return;
		}

		$seFactory = $services->getSearchEngineFactory();
		foreach ( $config->getSearchTypes() as $type ) {
			$search = $seFactory->create( $type );
			if ( !$search->supports( 'search-update' ) ) {
				continue;
			}

			$normalTitle = $this->getNormalizedTitle( $search );

			if ( $this->getLatestPage() === null ) {
				$search->delete( $this->id, $normalTitle );
				continue;
			} elseif ( $this->content === null ) {
				$search->updateTitle( $this->id, $normalTitle );
				continue;
			}

			$text = $this->content !== null ? $this->content->getTextForSearchIndex() : '';
			$text = $this->updateText( $text, $search );

			# Perform the actual update
			$search->update( $this->id, $normalTitle, $search->normalizeText( $text ) );
		}
	}
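
	// Typical call site (an illustrative sketch, not code from this file):
	// callers normally queue this class as a deferred update rather than
	// calling doUpdate() directly, e.g.
	//
	//   DeferredUpdates::addUpdate( new SearchUpdate( $id, $title, $content ) );
	//
	// The deferred-update runner then invokes doUpdate() later in the request.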

	/**
	 * Clean text for indexing. Only really suitable for indexing in databases.
	 * If you're using a real search engine, you'll probably want to override
	 * this behavior and do something nicer with the original wikitext.
	 *
	 * @param string $text
	 * @param SearchEngine|null $se Search engine
	 * @return string
	 */
	public function updateText( $text, SearchEngine $se = null ) {
		$services = MediaWikiServices::getInstance();
		$contLang = $services->getContentLanguage();
		# Language-specific strip/conversion
		$text = $contLang->normalizeForSearch( $text );
		$se = $se ?: $services->newSearchEngine();
		$lc = $se->legalSearchChars() . '&#;';

		# Strip HTML markup
		$text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/",
			' ', $contLang->lc( " " . $text . " " ) );
		$text = preg_replace( "/(^|\\n)==\\s*([^\\n]+)\\s*==(\\s)/sD",
			"\\1\\2 \\2 \\2\\3", $text ); # Emphasize headings

		# Strip external URLs
		$uc = "A-Za-z0-9_\\/:.,~%\\-+&;#?!=()@\\x80-\\xFF";
		$protos = "http|https|ftp|mailto|news|gopher";
		$pat = "/(^|[^\\[])({$protos}):[{$uc}]+([^{$uc}]|$)/";
		$text = preg_replace( $pat, "\\1 \\3", $text );

		$p1 = "/([^\\[])\\[({$protos}):[{$uc}]+]/";
		$p2 = "/([^\\[])\\[({$protos}):[{$uc}]+\\s+([^\\]]+)]/";
		$text = preg_replace( $p1, "\\1 ", $text );
		$text = preg_replace( $p2, "\\1 \\3 ", $text );

		# Internal image links
		$pat2 = "/\\[\\[image:([{$uc}]+)\\.(gif|png|jpg|jpeg)([^{$uc}])/i";
		$text = preg_replace( $pat2, " \\1 \\3", $text );

		$text = preg_replace( "/([^{$lc}])([{$lc}]+)]]([a-z]+)/",
			"\\1\\2 \\2\\3", $text ); # Handle [[game]]s

		# Strip all remaining non-search characters
		$text = preg_replace( "/[^{$lc}]+/", " ", $text );

		/**
		 * Handle 's, s'
		 *
		 * $text = preg_replace( "/([{$lc}]+)'s /", "\\1 \\1's ", $text );
		 * $text = preg_replace( "/([{$lc}]+)s' /", "\\1s ", $text );
		 *
		 * These tail-anchored regexps are insanely slow. The worst case comes
		 * when Japanese or Chinese text (i.e., no word spacing) is written on
		 * a wiki configured for Western UTF-8 mode. The Unicode characters are
		 * expanded to hex codes and the "words" are very long paragraph-length
		 * monstrosities. On a large page the above regexps may take over 20
		 * seconds *each* on a 1GHz-level processor.
		 *
		 * Following are reversed versions which are consistently fast
		 * (about 3 milliseconds on a 1GHz-level processor).
		 */
		$text = strrev( preg_replace( "/ s'([{$lc}]+)/", " s'\\1 \\1", strrev( $text ) ) );
		$text = strrev( preg_replace( "/ 's([{$lc}]+)/", " s\\1", strrev( $text ) ) );

		# Strip wiki '' and '''
		$text = preg_replace( "/''[']*/", " ", $text );

		return $text;
	}
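
	// Worked example of the reversed-pattern trick in updateText() above
	// (illustrative only, using a simplified [a-z] legal-character class):
	//
	//   strrev( preg_replace( "/ s'([a-z]+)/", " s'\\1 \\1", strrev( "the cat's toy" ) ) )
	//
	// reverses the text to "yot s'tac eht", rewrites the now head-anchored
	// " s'tac" to " s'tac tac", and reverses back to "the cat cat's toy",
	// so both the possessive form and the bare word get indexed.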

	/**
	 * Get WikiPage for the SearchUpdate $id using WikiPage::READ_LATEST
	 * and ensure using the same WikiPage object if there are multiple
	 * SearchEngine types.
	 *
	 * Returns null if a page has been deleted or is not found.
	 *
	 * @return WikiPage|null
	 */
	private function getLatestPage() {
		if ( !isset( $this->page ) ) {
			$this->page = MediaWikiServices::getInstance()->getWikiPageFactory()
				->newFromID( $this->id, WikiPage::READ_LATEST );
		}

		return $this->page;
	}

	/**
	 * Get a normalized string representation of a title suitable for
	 * including in a search index
	 *
	 * @param SearchEngine $search
	 * @return string A stripped-down title string ready for the search index
	 */
	private function getNormalizedTitle( SearchEngine $search ) {
		$contLang = MediaWikiServices::getInstance()->getContentLanguage();
		$ns = $this->title->getNamespace();
		$title = $this->title->getText();

		$lc = $search->legalSearchChars() . '&#;';
		$t = $contLang->normalizeForSearch( $title );
		$t = preg_replace( "/[^{$lc}]+/", ' ', $t );
		$t = $contLang->lc( $t );

		# Handle 's, s'
		$t = preg_replace( "/([{$lc}]+)'s( |$)/", "\\1 \\1's ", $t );
		$t = preg_replace( "/([{$lc}]+)s'( |$)/", "\\1s ", $t );

		$t = preg_replace( "/\\s+/", ' ', $t );

		if ( $ns === NS_FILE ) {
			$t = preg_replace( "/ (png|gif|jpg|jpeg|ogg)$/", "", $t );
		}

		return $search->normalizeText( trim( $t ) );
	}
}
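
// Example of the cleanup updateText() performs (a sketch, assuming a wiki
// context with a configured default search engine; $pageId, $title and
// $content are hypothetical caller-supplied values):
//
//   $update = new SearchUpdate( $pageId, $title, $content );
//   $plain = $update->updateText( "''Hello'' [[world]]s" );
//
// This lower-cases the text, strips the bold/italic quote markup, and
// rewrites the "[[world]]s" link via the [[game]]s rule so that both
// "world" and "worlds" appear in the indexed text, yielding roughly
// "hello world worlds" (modulo extra whitespace).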