1<?php 2// abbreviationmatcher.php -- HotCRP abbreviation matcher helper class 3// Copyright (c) 2006-2018 Eddie Kohler; see LICENSE. 4 5// Match priority (higher = more priority): 6// 5. Exact match 7// 4. Exact match with [-_.–—] replaced by spaces 8// 3. Case-insensitive match with [-_.–—] replaced by spaces 9// 2. Case-insensitive word match with [-_.–—] replaced by spaces 10// 1. Case-insensitive CamelCase match with [-_.–—] replaced by spaces 11// If a word match is performed, prefer matches that match more complete words. 12// Words must appear in order, so pattern "hello kitty" does not match "kitty 13// hello". 14// If the pattern has no Unicode characters, these steps are performed against 15// the deaccented subject (so subjects “élan” and “elan” match pattern “elan” 16// with the same priority). If the pattern has Unicode characters, then exact 17// matches take priority over deaccented matches (so subject “élan” is a higher 18// priority match for pattern “élan”). 19 20class AbbreviationMatchTracker { 21 private $isu; 22 private $pattern; 23 private $dpattern; 24 private $upattern; 25 private $dupattern; 26 private $imatchre; 27 private $is_camel_word; 28 private $has_star; 29 private $camelwords; 30 private $mclass = 1; 31 private $matches = []; 32 33 function __construct($pattern, $isu = null) { 34 if ($isu === null) 35 $isu = !!preg_match('/[\x80-\xFF]/', $pattern); 36 $this->isu = $isu; 37 if ($isu) { 38 $this->pattern = UnicodeHelper::normalize($pattern); 39 $this->dpattern = AbbreviationMatcher::dedash($this->pattern); 40 $this->upattern = UnicodeHelper::deaccent($this->pattern); 41 $this->dupattern = AbbreviationMatcher::dedash($this->upattern); 42 } else { 43 $this->pattern = $this->upattern = $pattern; 44 $this->dpattern = $this->dupattern = AbbreviationMatcher::dedash($pattern); 45 } 46 $this->is_camel_word = AbbreviationMatcher::is_camel_word($pattern); 47 $this->has_star = strpos($pattern, "*") !== false; 48 } 49 private function wmatch_score($pattern, $subject, $flags) { 50 // assert($pattern whitespace is simplified) 51 $pwords = explode(" ", $pattern); 52 $swords = preg_split('{\s+}', $subject); 53 $ppos = $spos = $demerits = 0; 54 $pword = null; 55 $pword_pos = -1; 56 $pword_star = false; 57 while (isset($pwords[$ppos]) && isset($swords[$spos])) { 58 if ($pword_pos !== $ppos) { 59 $pword = '{\A' . preg_quote($pwords[$ppos]) . '(\S*)\z}' . $flags; 60 $pword_pos = $ppos; 61 if ($this->has_star && strpos($pwords[$ppos], "*") !== false) { 62 $pword = str_replace('\\*', '.*', $pword); 63 $pword_star = true; 64 } else 65 $pword_star = false; 66 } 67 if (preg_match($pword, $swords[$spos], $m)) { 68 ++$ppos; 69 $demerits += ($m[1] !== "" || $pword_star ? 1 : 0); 70 } else 71 $demerits += 1; 72 ++$spos; 73 } 74 // missed words cost 1/64 point, partial words cost 1/64 point 75 if (!isset($pwords[$ppos])) { 76 if (!$this->has_star) 77 $demerits += count($swords) - $spos; 78 return 1 - 0.015625 * max(min($demerits, 63), 1); 79 } else 80 return 0; 81 } 82 private function camel_wmatch_score($subject) { 83 assert($this->is_camel_word); 84 if (!$this->camelwords) { 85 $this->camelwords = []; 86 $x = $this->pattern; 87 while (preg_match('{\A[-_.]*([a-z]+|[A-Z][a-z]*|[0-9]+)(.*)\z}', $x, $m)) { 88 $this->camelwords[] = $m[1]; 89 $this->camelwords[] = $m[2] !== "" && ctype_alnum(substr($m[2], 0, 1)); 90 $x = $m[2]; 91 } 92 } 93 $swords = preg_split('{\s+}', $subject); 94 $ppos = $spos = $demerits = 0; 95 while (isset($this->camelwords[$ppos]) && isset($swords[$spos])) { 96 $pword = $this->camelwords[$ppos]; 97 $sword = $swords[$spos]; 98 $ppos1 = $ppos; 99 $sidx = 0; 100 while ($sidx + strlen($pword) <= strlen($sword) 101 && strcasecmp($pword, substr($sword, $sidx, strlen($pword))) === 0) { 102 $sidx += strlen($pword); 103 $ppos += 2; 104 if (!$this->camelwords[$ppos - 1]) 105 break; 106 $pword = $this->camelwords[$ppos]; 107 } 108 $demerits += ($sidx < strlen($sword) ? 1 : 0); 109 ++$spos; 110 } 111 if (!isset($this->camelwords[$ppos])) { 112 if (!$this->has_star) 113 $demerits += count($swords) - $spos; 114 return 1 - 0.015625 * max(min($demerits, 63), 1); 115 } else 116 return 0; 117 } 118 private function mclass($subject, $sisu = null) { 119 if ($sisu === null) 120 $sisu = !!preg_match('/[\x80-\xFF]/', $subject); 121 122 if ($this->isu && $sisu) { 123 if ($this->pattern === $subject) 124 return 9; 125 126 if ($this->mclass < 9) { 127 $dsubject = AbbreviationMatcher::dedash($subject); 128 if ($this->dpattern === $dsubject) 129 return 8; 130 } 131 132 if ($this->mclass < 7) { 133 if (!$this->imatchre) 134 $this->imatchre = '{\A' . preg_quote($this->dpattern) . '\z}iu'; 135 if (preg_match($this->imatchre, $dsubject)) 136 return 7; 137 } 138 139 if ($this->mclass < 7) { 140 $s = $this->wmatch_score($this->dpattern, $dsubject, "iu"); 141 if ($s) 142 return 6 + $s; 143 } 144 } 145 146 if ($this->mclass < 6) { 147 $usubject = $sisu ? UnicodeHelper::deaccent($subject) : $subject; 148 if ($this->upattern === $usubject) 149 return 5; 150 } 151 152 if ($this->mclass < 5) { 153 $dusubject = AbbreviationMatcher::dedash($usubject); 154 if ($this->dupattern === $dusubject) 155 return 4; 156 } 157 158 if ($this->mclass < 4) { 159 if (strcasecmp($this->dupattern, $dusubject) === 0) 160 return 3; 161 } 162 163 if ($this->mclass < 3) { 164 $s = $this->wmatch_score($this->dupattern, $dusubject, "i"); 165 if ($s) 166 return 2 + $s; 167 } 168 169 if ($this->mclass < 2 && $this->is_camel_word) { 170 $s = $this->camel_wmatch_score($dusubject); 171 if ($s) 172 return 1 + $s; 173 } 174 175 return 0; 176 } 177 178 function check($subject, $data, $sisu = null) { 179 $mclass = $this->mclass($subject, $sisu); 180 if ($mclass > $this->mclass) { 181 $this->mclass = $mclass; 182 $this->matches = [$data]; 183 } else if ($mclass == $this->mclass 184 && $this->matches[count($this->matches) - 1] !== $data) 185 $this->matches[] = $data; 186 } 187 188 function matches() { 189 return $this->matches; 190 } 191} 192 193class AbbreviationClass { 194 const TYPE_CAMELCASE = 0; 195 const TYPE_LOWERDASH = 1; 196 public $type = self::TYPE_CAMELCASE; 197 public $drop_parens = true; 198 public $nwords = 3; 199 public $stopwords = ""; 200 public $tflags = 0; 201 public $index = 0; 202 203 function step() { 204 ++$this->index; 205 if ($this->index >= 1) 206 $this->drop_parens = false; 207 if ($this->index >= 2) 208 $this->stopwords = false; 209 if ($this->index > $this->nwords) 210 $this->nwords = $this->index; 211 return $this->index <= 5; 212 } 213} 214 215class AbbreviationMatcher { 216 private $data = []; 217 private $nanal = 0; 218 private $matches = []; 219 private $abbreviators = []; 220 221 function add($name, $data, $tflags = 0, $prio = 0) { 222 $this->data[] = [$name, null, $data, $tflags, $prio]; 223 $this->matches = []; 224 } 225 226 function set_abbreviator($tflags, Abbreviator $abbreviator) { 227 $this->abbreviators[$tflags] = $abbreviator; 228 } 229 230 static function dedash($text) { 231 return preg_replace('{(?:[-_.\s]|–|—)+}', " ", $text); 232 } 233 static function is_camel_word($text) { 234 return preg_match('{\A[-_.A-Za-z0-9]*(?:[A-Za-z](?=[-_.A-Z0-9])|[0-9](?=[-_.A-Za-z]))[-_.A-Za-z0-9]*\*?\z}', $text); 235 } 236 237 private function _analyze() { 238 while ($this->nanal < count($this->data)) { 239 $name = $uname = simplify_whitespace($this->data[$this->nanal][0]); 240 if (preg_match('/[\x80-\xFF]/', $name)) { 241 $name = UnicodeHelper::normalize($name); 242 $uname = UnicodeHelper::deaccent($name); 243 } 244 $this->data[$this->nanal][0] = $name; 245 $this->data[$this->nanal][1] = self::dedash($uname); 246 ++$this->nanal; 247 } 248 } 249 250 private function _find_all($pattern) { 251 if (empty($this->matches)) 252 $this->_analyze(); 253 // A call to Abbreviator::abbreviation_for() might call back in 254 // to AbbreviationMatcher::find_all(). Short-circuit that call. 255 $this->matches[$pattern] = []; 256 257 $spat = $upat = simplify_whitespace($pattern); 258 if (($sisu = !!preg_match('/[\x80-\xFF]/', $spat))) { 259 $spat = UnicodeHelper::normalize($spat); 260 $upat = UnicodeHelper::deaccent($spat); 261 } 262 $dupat = self::dedash($upat); 263 if (self::is_camel_word($upat)) { 264 $re = preg_replace('{([A-Za-z](?=[A-Z0-9 ])|[0-9](?=[A-Za-z ]))}', '$1(?:|.*\b)', $dupat); 265 $re = '{\b' . str_replace(" ", "", $re) . '}i'; 266 } else { 267 $re = join('.*\b', preg_split('{[^A-Za-z0-9*]+}', $dupat)); 268 $re = '{\b' . str_replace("*", ".*", $re) . '}i'; 269 } 270 271 $mclass = 0; 272 $matches = []; 273 foreach ($this->data as $i => $d) { 274 if (strcasecmp($dupat, $d[1]) === 0) { 275 if ($mclass === 0) 276 $matches = []; 277 $mclass = 1; 278 $matches[] = $i; 279 } else if ($mclass === 0 && preg_match($re, $d[1])) 280 $matches[] = $i; 281 } 282 283 if (count($matches) > 1) { 284 $amt = new AbbreviationMatchTracker($spat, $sisu); 285 foreach ($matches as $i) { 286 $d = $this->data[$i]; 287 $amt->check($d[0], $i, strlen($d[0]) !== strlen($d[1])); 288 } 289 $matches = $amt->matches(); 290 } 291 292 if (empty($matches)) { 293 $last_abbreviator = $last_value = null; 294 $amt = new AbbreviationMatchTracker($spat, $sisu); 295 foreach ($this->data as $i => $d) { 296 if ($d[2] instanceof Abbreviator) 297 $abbreviator = $d[2]; 298 else if (isset($this->abbreviators[$d[3]])) 299 $abbreviator = $this->abbreviators[$d[3]]; 300 else 301 continue; 302 if ($last_abbreviator === $abbreviator && $last_value === $d[2]) 303 continue; 304 $last_abbreviator = $abbreviator; 305 $last_value = $d[2]; 306 if (($abbrs = $abbreviator->abbreviations_for($d[0], $d[2]))) 307 foreach (is_string($abbrs) ? [$abbrs] : $abbrs as $abbr) 308 $amt->check($abbr, $i); 309 } 310 $matches = $amt->matches(); 311 } 312 313 $this->matches[$pattern] = $matches; 314 } 315 316 function find_all($pattern, $tflags = 0) { 317 if (!array_key_exists($pattern, $this->matches)) 318 $this->_find_all($pattern); 319 $results = []; 320 $last = $prio = false; 321 foreach ($this->matches[$pattern] as $i) { 322 $d = $this->data[$i]; 323 if (!$tflags || ($d[3] & $tflags) != 0) { 324 if ($prio === false || $d[4] > $prio) { 325 $results = []; 326 $prio = $d[4]; 327 } 328 if (empty($results) || $d[2] !== $last) 329 $results[] = $last = $d[2]; 330 } 331 } 332 return $results; 333 } 334 335 function find1($pattern, $tflags = 0) { 336 $a = $this->find_all($pattern, $tflags); 337 if (empty($a)) 338 return false; 339 else if (count($a) == 1) 340 return $a[0]; 341 else 342 return null; 343 } 344 345 346 function unique_abbreviation($name, $data, AbbreviationClass $aclass) { 347 $last = $aclass_clone = null; 348 while (true) { 349 $x = self::make_abbreviation($name, $aclass); 350 if ($last !== $x) { 351 $last = $x; 352 $a = $this->find_all($x, $aclass->tflags); 353 if (count($a) === 1 && $a[0] === $data) 354 return $x; 355 } 356 if (!$aclass_clone) 357 $aclass = $aclass_clone = clone $aclass; 358 if (!$aclass->step()) 359 return null; 360 } 361 } 362 363 static function make_abbreviation($name, AbbreviationClass $aclass) { 364 $name = str_replace("'", "", $name); 365 // try to filter out noninteresting words 366 if ($aclass->stopwords !== false) { 367 $stopwords = (string) $aclass->stopwords; 368 if ($stopwords !== "") 369 $stopwords .= "|"; 370 $xname = preg_replace('/\b(?:' . $stopwords . 'a|an|and|be|did|do|for|in|of|or|the|their|they|this|to|with|you)\b/i', '', $name); 371 $name = $xname ? : $name; 372 } 373 // drop parenthetical remarks 374 if ($aclass->drop_parens) 375 $name = preg_replace('/\(.*?\)|\[.*?\]/', ' ', $name); 376 // drop unlikely punctuation 377 $xname = preg_replace('/[-:\s+,.?!()\[\]\{\}_\/\"]+/', " ", " $name "); 378 // drop extraneous words 379 $xname = preg_replace('/\A(' . str_repeat(' \S+', $aclass->nwords) . ' ).*\z/', '$1', $xname); 380 if ($aclass->type === AbbreviationClass::TYPE_CAMELCASE) { 381 $xname = str_replace(" ", "", ucwords($xname)); 382 return preg_replace('/([A-Z][a-z][a-z])[a-z]*/', '$1', $xname); 383 } else 384 return strtolower(str_replace(" ", "-", trim($xname))); 385 } 386} 387