<?php

/*
 * This file is part of the TYPO3 CMS project.
 *
 * It is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License, either version 2
 * of the License, or any later version.
 *
 * For the full copyright and license information, please read the
 * LICENSE.txt file that was distributed with this source code.
 *
 * The TYPO3 project - inspiring people to share!
 */

namespace TYPO3\CMS\IndexedSearch;

use TYPO3\CMS\Core\Charset\CharsetConverter;
use TYPO3\CMS\Core\Utility\GeneralUtility;

/**
 * Lexer class for indexed_search
 * A lexer splits the text into words
 * @internal
 */
class Lexer
{
    protected const CHARTYPE_NUMBER = 'num';
    protected const CHARTYPE_ALPHA = 'alpha';
    // CJK (Chinese / Japanese / Korean)
    protected const CHARTYPE_CJK = 'cjk';

    /**
     * Debugging switch: if set, split2Words() fills $debugString.
     *
     * @var bool
     */
    public $debug = false;

    /**
     * HTML output highlighting search / non-search words (for backend display).
     * Only filled by split2Words() when $debug is set; reset on every call.
     *
     * @var string
     */
    public $debugString = '';

    /**
     * Configuration of the lexer:
     *
     * @var array
     */
    public $lexerConf = [
        // Code points that may join letters inside a word. Characters: . - _ : / '
        'printjoins' => [46, 45, 95, 58, 47, 39],
        // Set, if case sensitive indexing is wanted.
        'casesensitive' => false,
        // Code points stripped from every non-CJK word before it is added. Character: -
        'removeChars' => [45],
    ];

    /**
     * Splitting string into words.
     * Used for indexing, can also be used to find words in query.
     *
     * @param string $wordString String with UTF-8 content to process.
     * @return array Array of words in utf-8
     */
    public function split2Words($wordString)
    {
        // Reset debug string:
        $this->debugString = '';
        // Then convert the string to lowercase:
        if (!$this->lexerConf['casesensitive']) {
            $wordString = mb_strtolower($wordString, 'utf-8');
        }
        // Now, splitting words:
        $pos = 0;
        $words = [];
        while (true) {
            $found = $this->get_word($wordString, $pos);
            // FALSE means end of string; a zero length means no further word.
            if ($found === false) {
                break;
            }
            [$start, $len] = $found;
            if (!$len) {
                break;
            }
            $this->addWords($words, $wordString, $start, $len);
            if ($this->debug) {
                // Skipped (non-word) bytes are shown in red, followed by the word itself.
                $this->debugString .= '<span style="color:red">' . htmlspecialchars(substr(
                    $wordString,
                    $pos,
                    $start - $pos
                )) . '</span>' . htmlspecialchars(substr($wordString, $start, $len));
            }
            $pos = $start + $len;
        }
        return $words;
    }

    /**********************************
     *
     * Helper functions
     *
     ********************************/
    /**
     * Add word to word-array
     * This function should be used to make sure CJK sequences are split up in the right way
     *
     * @param array $words Array of accumulated words
     * @param string $wordString Complete Input string from where to extract word
     * @param int $start Start position (byte offset) of word in input string
     * @param int $len The Length (in bytes) of the word string from start position
     */
    public function addWords(&$words, &$wordString, $start, $len)
    {
        // Get word out of string:
        $theWord = substr($wordString, $start, $len);
        // Get first chars unicode number and find type:
        $bc = 0;
        $cp = $this->utf8_ord($theWord, $bc);
        $cType = $this->charType((int)$cp);
        /*
         * DESCRIPTION OF (CJK) ALGORITHM
         *
         * Continuous letters and numbers make up words. Spaces and symbols
         * separate letters and numbers into words. This is sufficient for
         * all western text.
         *
         * CJK doesn't use spaces or separators to separate words, so the only
         * way to really find out what constitutes a word would be to have a
         * dictionary and advanced heuristics. Instead, we form pairs from
         * consecutive characters, in such a way that searches will find only
         * characters that appear more-or-less the right sequence. For example:
         *
         *     ABCDE => AB BC CD DE
         *
         * This works okay since both the index and the search query is split
         * in the same manner, and since the set of characters is huge so the
         * extra matches are not significant.
         *
         * (Hint taken from ZOPEs chinese user group)
         *
         * [Kasper: As far as I can see this will only work well with or-searches!]
         */
        if ($cType === self::CHARTYPE_CJK) {
            // Find total string length (in characters):
            $strlen = mb_strlen($theWord, 'utf-8');
            // Traverse string length and add words as pairs of two chars.
            // A single-character sequence is added as-is; otherwise the last
            // character is skipped because it is already part of the previous pair.
            for ($a = 0; $a < $strlen; $a++) {
                if ($strlen == 1 || $a < $strlen - 1) {
                    $words[] = mb_substr($theWord, $a, 2, 'utf-8');
                }
            }
        } else {
            // Normal "single-byte" chars:
            // Remove chars:
            $charsetConverter = GeneralUtility::makeInstance(CharsetConverter::class);
            foreach ($this->lexerConf['removeChars'] as $skipJoin) {
                $theWord = str_replace($charsetConverter->UnumberToChar($skipJoin), '', $theWord);
            }
            // Add word:
            $words[] = $theWord;
        }
    }

    /**
     * Get the first word in a given utf-8 string (initial non-letters will be skipped)
     *
     * @param string $str Input string (reference)
     * @param int $pos Starting position (byte offset) in input string
     * @return array|false 0: start, 1: len (both in bytes) or FALSE if no word has been found
     */
    public function get_word(&$str, $pos = 0)
    {
        $len = 0;
        // If return is TRUE, a word was found starting at this position, so returning position and length:
        if ($this->utf8_is_letter($str, $len, $pos)) {
            return [$pos, $len];
        }
        // If the return value was FALSE it means a sequence of non-word chars were found (or blank string)
        // - so we will start another search for the word:
        $pos += $len;
        if ((string)($str[$pos] ?? '') === '') {
            // Check end of string before looking for word of course.
            return false;
        }
        $this->utf8_is_letter($str, $len, $pos);
        return [$pos, $len];
    }

    /**
     * See if a character is a letter (or a string of letters or non-letters).
     *
     * @param string $str Input string (reference)
     * @param int $len Byte-length of character sequence (reference, return value)
     * @param int $pos Starting position in input string
     * @return bool letter (or word) found
     */
    public function utf8_is_letter(&$str, &$len, $pos = 0)
    {
        $len = 0;
        $bc = 0;
        $cp = 0;
        // Length of the sequence up to (not including) the first trailing print-join char; 0 = none recorded
        $printJoinLgd = 0;
        $cType = ($cType_prev = false);
        // Letter type: TRUE while the sequence being scanned is assumed to be a word
        $letter = true;
        // looking for a letter?
        if ((string)($str[$pos] ?? '') === '') {
            // Return FALSE on end-of-string at this stage
            return false;
        }
        while (1) {
            // If characters has been obtained we will know whether the string starts as a sequence of letters or not:
            if ($len) {
                if ($letter) {
                    // We are in a sequence of words
                    if (
                        !$cType
                        || ($cType_prev === self::CHARTYPE_CJK && ($cType === self::CHARTYPE_NUMBER || $cType === self::CHARTYPE_ALPHA))
                        || ($cType === self::CHARTYPE_CJK && ($cType_prev === self::CHARTYPE_NUMBER || $cType_prev === self::CHARTYPE_ALPHA))
                    ) {
                        // Check if the non-letter char is NOT a print-join char because then it signifies the end of the word.
                        // The comparison is loose, so numeric strings in a customised lexerConf still match.
                        if (!in_array($cp, $this->lexerConf['printjoins'])) {
                            // If a printjoin start length has been recorded, set that back now so the length is right (filtering out multiple end chars)
                            if ($printJoinLgd) {
                                $len = $printJoinLgd;
                            }
                            return true;
                        }
                        // If a printJoin char is found, record the length if it has not been recorded already:
                        if (!$printJoinLgd) {
                            $printJoinLgd = $len;
                        }
                    } else {
                        // When a true letter is found, reset printJoinLgd counter:
                        $printJoinLgd = 0;
                    }
                } elseif (!$letter && $cType) {
                    // end of non-word reached
                    return false;
                }
            }
            // add byte-length of last found character
            $len += $bc;
            if ((string)($str[$pos] ?? '') === '') {
                // End of string; return status of string till now
                return $letter;
            }
            // Get next chars unicode number:
            $cp = $this->utf8_ord($str, $bc, $pos);
            $pos += $bc;
            // Determine the type:
            $cType_prev = $cType;
            $cType = $this->charType((int)$cp);
            if ($cType !== null) {
                continue;
            }
            // Setting letter to FALSE if the first char was not a letter!
            if (!$len) {
                $letter = false;
            }
        }
        return false;
    }

    /**
     * Determine the type of character
     *
     * @param int $cp Unicode number to evaluate
     * @return string|null Type of char; the main type: num, alpha or CJK (Chinese / Japanese / Korean); NULL for anything else
     */
    public function charType($cp)
    {
        // Numeric?
        if ($cp >= 48 && $cp <= 57) {
            return self::CHARTYPE_NUMBER;
        }
        // LOOKING for Alpha chars (Latin, Cyrillic, Greek, Hebrew and Arabic):
        if (
            ($cp >= 65 && $cp <= 90)
            || ($cp >= 97 && $cp <= 122)
            || ($cp >= 192 && $cp <= 255 && $cp != 215 && $cp != 247)
            || ($cp >= 256 && $cp < 640)
            || $cp == 902
            || ($cp >= 904 && $cp < 1024)
            || ($cp >= 1024 && $cp < 1154)
            || ($cp >= 1162 && $cp < 1328)
            || ($cp >= 1424 && $cp < 1456)
            || ($cp >= 1488 && $cp < 1523)
            || ($cp >= 1569 && $cp <= 1624)
            || ($cp >= 1646 && $cp <= 1747)
            || ($cp >= 7680 && $cp < 8192)
        ) {
            return self::CHARTYPE_ALPHA;
        }
        // Looking for CJK (Chinese / Japanese / Korean)
        // Ranges are not certain - deducted from the translation tables in typo3/sysext/core/Resources/Private/Charsets/csconvtbl/
        // Verified with http://www.unicode.org/charts/ (16/2) - may still not be complete.
        if (
            ($cp >= 12352 && $cp <= 12543)
            || ($cp >= 12592 && $cp <= 12687)
            || ($cp >= 13312 && $cp <= 19903)
            || ($cp >= 19968 && $cp <= 40879)
            || ($cp >= 44032 && $cp <= 55215)
            || ($cp >= 131072 && $cp <= 195103)
        ) {
            return self::CHARTYPE_CJK;
        }
        return null;
    }

    /**
     * Converts a UTF-8 multibyte character to a UNICODE codepoint
     *
     * Malformed input never raises: a missing byte (empty string, offset past the
     * end, truncated sequence) counts as 0, and the byte 0xFF - which can not start
     * any sequence - is reported as U+FFFD (replacement character) with length 1.
     *
     * @param string $str UTF-8 multibyte character string (reference)
     * @param int $len The length of the character in bytes (reference, return value)
     * @param int $pos Starting position in input string
     * @param bool $hex If set, then a hex. number (prefixed with "x") is returned
     * @return int|string UNICODE codepoint
     */
    public function utf8_ord(&$str, &$len, $pos = 0, $hex = false)
    {
        // A missing byte counts as 0, without triggering an "uninitialized string offset" warning.
        $ord = isset($str[$pos]) ? ord($str[$pos]) : 0;
        $len = 1;
        if ($ord > 128) {
            // calculate number of extra bytes (leading 1-bits of the first byte, minus one)
            for ($bc = -1, $mbs = $ord; $mbs & 128; $mbs = $mbs << 1) {
                $bc++;
            }
            if ($bc > 6) {
                // Only 0xFF gets here: all eight bits set would mean a negative shift width below
                // (ArithmeticError). Treat it as a single invalid byte.
                $ord = 0xFFFD;
            } else {
                $len += $bc;
                // mask utf-8 lead-in bytes
                $ord = $ord & ((1 << (6 - $bc)) - 1);
                // "bring in" data bytes
                for ($i = $pos + 1; $bc; $bc--, $i++) {
                    $ord = ($ord << 6) | (isset($str[$i]) ? ord($str[$i]) & 63 : 0);
                }
            }
        }
        return $hex ? 'x' . dechex($ord) : $ord;
    }
}