<?php
// authormatcher.php -- HotCRP author matchers
// Copyright (c) 2006-2018 Eddie Kohler; see LICENSE.

/**
 * An Author that can be matched against other authors by name and/or
 * affiliation, plus helpers for cleaning up free-form collaborator lists.
 *
 * Matchers are built lazily by prepare() the first time they are needed.
 */
class AuthorMatcher extends Author {
    /** @var ?object `preg_raw`/`preg_utf8` pattern pair for the first name; unset if none */
    private $firstName_matcher;
    /** @var ?object `preg_raw`/`preg_utf8` pattern pair for the last name; unset if none */
    private $lastName_matcher;
    /** @var ?array `[direct words, any-weak flag, regex]` for the affiliation; unset if none */
    private $affiliation_matcher;
    /** @var null|false|object null until prepare() runs; then false (nothing to
     *    match) or a `preg_raw`/`preg_utf8` pattern pair covering every word */
    private $general_pregexes_;

    /** @var ?array cached contents of etc/affiliationmatchers.json, keyed by word */
    private static $wordinfo;

    function __construct($x = null) {
        parent::__construct($x);
    }

    /**
     * Build the first-name, last-name, affiliation and general matchers
     * from this author's fields.
     *
     * NOTE(review): deaccent(0), deaccent(1) and deaccent(2) are inherited;
     * they are used here as the normalized first name, last name and
     * affiliation respectively -- confirm against Author.
     */
    private function prepare() {
        $any = [];
        if ($this->firstName !== "") {
            preg_match_all('/[a-z0-9]+/', $this->deaccent(0), $m);
            $rr = [];
            foreach ($m[0] as $w) {
                $any[] = $rr[] = $w;
                if (ctype_alpha($w[0])) {
                    // A bare initial matches any word starting with it; a
                    // full word also matches its own initial followed by "."
                    if (strlen($w) === 1)
                        $any[] = $rr[] = $w . "[a-z]*";
                    else
                        $any[] = $rr[] = $w[0] . "(?=\\.)";
                }
            }
            if (!empty($rr))
                $this->firstName_matcher = (object) [
                    "preg_raw" => '\b(?:' . join("|", $rr) . ')\b',
                    "preg_utf8" => Text::UTF8_INITIAL_NONLETTERDIGIT . '(?:' . join("|", $rr) . ')' . Text::UTF8_FINAL_NONLETTERDIGIT
                ];
        }
        if ($this->lastName !== "") {
            preg_match_all('/[a-z0-9]+/', $this->deaccent(1), $m);
            $rr = $ur = [];
            foreach ($m[0] as $w) {
                $any[] = $w;
                // One lookahead per word: every last-name word must appear
                $rr[] = '(?=.*\b' . $w . '\b)';
                $ur[] = '(?=.*' . Text::UTF8_INITIAL_NONLETTERDIGIT . $w . Text::UTF8_FINAL_NONLETTERDIGIT . ')';
            }
            if (!empty($rr))
                $this->lastName_matcher = (object) [
                    "preg_raw" => '\A' . join("", $rr),
                    "preg_utf8" => '\A' . join("", $ur)
                ];
        }
        if ($this->affiliation !== "") {
            $wordinfo = self::wordinfo();
            preg_match_all('/[a-z0-9&]+/', $this->deaccent(2), $m);

            $directs = $alts = [];
            $any_weak = false;
            foreach ($m[0] as $w) {
                $aw = get($wordinfo, $w);
                if ($aw && isset($aw->stop) && $aw->stop)
                    continue;
                $any[] = preg_quote($w);
                $directs[] = $w;
                if ($aw && isset($aw->weak) && $aw->weak)
                    $any_weak = true;
                if ($aw && isset($aw->alternate)) {
                    if (is_array($aw->alternate))
                        $alts = array_merge($alts, $aw->alternate);
                    else
                        $alts[] = $aw->alternate;
                }
                if ($aw && isset($aw->sync)) {
                    if (is_array($aw->sync))
                        $alts = array_merge($alts, $aw->sync);
                    else
                        $alts[] = $aw->sync;
                }
            }

            $rs = $directs;
            foreach ($alts as $alt) {
                if (is_object($alt)) {
                    if ((isset($alt->if) && !self::match_if($alt->if, $rs))
                        || (isset($alt->if_not) && self::match_if($alt->if_not, $rs)))
                        continue;
                    $alt = $alt->word;
                }
                // Skip malformed word-info entries rather than dumping them
                // into the output and feeding a non-string to explode().
                if (!is_string($alt))
                    continue;
                foreach (explode(" ", $alt) as $altw)
                    if ($altw !== "") {
                        $any[] = preg_quote($altw);
                        $rs[] = $altw;
                        $any_weak = true;
                    }
            }

            if (!empty($rs)) {
                $rex = '{\b(?:' . str_replace('&', '\\&', join("|", $rs)) . ')\b}';
                $this->affiliation_matcher = [$directs, $any_weak, $rex];
            }
        }

        $content = join("|", $any);
        if ($content !== "" && $content !== "none") {
            $this->general_pregexes_ = (object) [
                "preg_raw" => '\b(?:' . $content . ')\b',
                "preg_utf8" => Text::UTF8_INITIAL_NONLETTER . '(?:' . $content . ')' . Text::UTF8_FINAL_NONLETTER
            ];
        } else
            $this->general_pregexes_ = false;
    }

    /**
     * Return the pattern pair matching any word of this matcher, building
     * it on first use.
     * @return object|false false when there is nothing to match
     */
    function general_pregexes() {
        if ($this->general_pregexes_ === null)
            $this->prepare();
        return $this->general_pregexes_;
    }

    /**
     * Create a matcher from `$x` and set its `nonauthor` flag.
     * @return ?AuthorMatcher null when `$x` is "" or the result is empty
     */
    static function make($x, $nonauthor) {
        if ($x !== "") {
            $m = new AuthorMatcher($x);
            if (!$m->is_empty()) {
                $m->nonauthor = $nonauthor;
                return $m;
            }
        }
        return null;
    }

    /** Create a matcher by passing string `$x` to assign_string_guess(). */
    static function make_string_guess($x) {
        $m = new AuthorMatcher;
        $m->assign_string_guess($x);
        return $m;
    }

    /** Create a matcher whose only populated field is the affiliation `$x`. */
    static function make_affiliation($x) {
        $m = new AuthorMatcher;
        $m->affiliation = (string) $x;
        return $m;
    }

    /**
     * Create a nonauthor matcher from one collaborator line.
     * @return ?AuthorMatcher null when `$x` is "" or "none" (any case)
     */
    static function make_collaborator_line($x) {
        if ($x === "" || strcasecmp($x, "none") === 0)
            return null;
        else {
            $m = new AuthorMatcher;
            $m->assign_string($x);
            $m->nonauthor = true;
            return $m;
        }
    }

    const MATCH_NAME = 1;
    const MATCH_AFFILIATION = 2;

    /**
     * Test whether author `$au` matches this matcher.
     *
     * @param Author|string $au strings are parsed with Author::make_string_guess()
     * @param bool $prefer_name when true, affiliation matching is attempted
     *    only if this matcher or `$au` lacks a last name
     * @return int|false MATCH_NAME, MATCH_AFFILIATION, or false
     */
    function test($au, $prefer_name = false) {
        if ($this->general_pregexes_ === null)
            $this->prepare();
        if (!$this->general_pregexes_)
            return false;
        if (is_string($au))
            $au = Author::make_string_guess($au);
        // Name match: last name must match; first name must match only if
        // both sides have one.
        if ($this->lastName_matcher
            && $au->lastName !== ""
            && Text::match_pregexes($this->lastName_matcher, $au->lastName, $au->deaccent(1))
            && ($au->firstName === ""
                || !$this->firstName_matcher
                || Text::match_pregexes($this->firstName_matcher, $au->firstName, $au->deaccent(0)))) {
            return self::MATCH_NAME;
        }
        if ($this->affiliation_matcher
            && $au->affiliation !== ""
            && (!$prefer_name || $this->lastName === "" || $au->lastName === "")
            && $this->test_affiliation($au->deaccent(2))) {
            return self::MATCH_AFFILIATION;
        }
        return false;
    }

    /**
     * Highlight in `$au` every word matched by any of `$matchers`.
     *
     * @param object|string $au an object is first rendered to text (name
     *    plus parenthesized affiliation); a string is highlighted as is
     * @param iterable $matchers AuthorMatcher objects
     * @return string the result of Text::highlight(), with a trailing
     *    affiliation wrapped in `<span class="auaff">` when it is still
     *    present verbatim at the end
     */
    static function highlight_all($au, $matchers) {
        $aff_suffix = null;
        if (is_object($au)) {
            if ($au->affiliation)
                $aff_suffix = "(" . htmlspecialchars($au->affiliation) . ")";
            if ($au instanceof Contact)
                $au = Text::name_text($au) . ($aff_suffix !== null ? " " . $aff_suffix : "");
            else
                $au = $au->nameaff_text();
        }
        $pregexes = [];
        foreach ($matchers as $matcher)
            $pregexes[] = $matcher->general_pregexes();
        if (count($pregexes) > 1)
            $pregexes = [Text::merge_pregexes($pregexes)];
        if (!empty($pregexes))
            $au = Text::highlight($au, $pregexes[0]);
        if ($aff_suffix && str_ends_with($au, $aff_suffix))
            $au = substr($au, 0, -strlen($aff_suffix))
                . '<span class="auaff">' . $aff_suffix . '</span>';
        return $au;
    }

    /** Highlight in `$au` the words matched by this matcher alone. */
    function highlight($au) {
        return self::highlight_all($au, [$this]);
    }

    /**
     * Return the affiliation word-info table, loading it from
     * `$ConfSitePATH/etc/affiliationmatchers.json` on first use.
     * @return array decoded top-level JSON value cast to an array
     */
    static function wordinfo() {
        global $ConfSitePATH;
        // XXX validate input JSON
        if (self::$wordinfo === null)
            self::$wordinfo = (array) json_decode(file_get_contents("$ConfSitePATH/etc/affiliationmatchers.json"));
        return self::$wordinfo;
    }

    /**
     * Test normalized affiliation text against the affiliation matcher.
     *
     * Without weak words or alternates, any regex hit is a match. Otherwise
     * a non-weak direct word (or a strong alternate for it) that is seen
     * matches immediately; failing that, every direct word must be seen.
     *
     * @param string $mtext affiliation text to test
     * @return bool
     */
    private function test_affiliation($mtext) {
        list($am_words, $am_any_weak, $am_regex) = $this->affiliation_matcher;
        if (!$am_any_weak)
            return preg_match($am_regex, $mtext) === 1;
        else if (!preg_match_all($am_regex, $mtext, $m))
            return false;
        $result = true;
        $wordinfo = self::wordinfo();
        foreach ($am_words as $w) { // $am_words contains no alternates
            $aw = get($wordinfo, $w);
            $weak = $aw && isset($aw->weak) && $aw->weak;
            $saw_w = in_array($w, $m[0], true);
            if (!$saw_w && $aw && isset($aw->alternate)) {
                // We didn't see a requested word; did we see one of its alternates?
                // `alternate` may be a single entry or a list (as in prepare()).
                $alternates = is_array($aw->alternate) ? $aw->alternate : [$aw->alternate];
                foreach ($alternates as $alt) {
                    if (is_object($alt)) {
                        if ((isset($alt->if) && !self::match_if($alt->if, $am_words))
                            || (isset($alt->if_not) && self::match_if($alt->if_not, $am_words)))
                            continue;
                        $alt = $alt->word;
                    }
                    // Ignore malformed entries, matching prepare()
                    if (!is_string($alt))
                        continue;
                    // Check for every word in the alternate list
                    $saw_w = true;
                    $altws = explode(" ", $alt);
                    foreach ($altws as $altw)
                        if ($altw !== "" && !in_array($altw, $m[0], true)) {
                            $saw_w = false;
                            break;
                        }
                    // If all are found, exit; check if the found alternate is strong
                    if ($saw_w) {
                        if ($weak && count($altws) == 1) {
                            $aw2 = get($wordinfo, $alt);
                            if (!$aw2 || !isset($aw2->weak) || !$aw2->weak)
                                $weak = false;
                        }
                        break;
                    }
                }
            }
            // Check for sync words: e.g., "penn state university" ≠
            // "university penn". For each sync word string, if *any* sync word
            // is in matcher, then *some* sync word must be in subject;
            // otherwise *no* sync word allowed in subject.
            if ($saw_w && $aw && isset($aw->sync) && $aw->sync !== "") {
                $synclist = is_array($aw->sync) ? $aw->sync : [$aw->sync];
                foreach ($synclist as $syncws) {
                    $syncws = explode(" ", $syncws);
                    $has_any_syncs = false;
                    foreach ($syncws as $syncw)
                        if ($syncw !== "" && in_array($syncw, $am_words, true)) {
                            $has_any_syncs = true;
                            break;
                        }
                    if ($has_any_syncs) {
                        $saw_w = false;
                        foreach ($syncws as $syncw)
                            if ($syncw !== "" && in_array($syncw, $m[0], true)) {
                                $saw_w = true;
                                break;
                            }
                    } else {
                        $saw_w = true;
                        foreach ($syncws as $syncw)
                            if ($syncw !== "" && in_array($syncw, $m[0], true)) {
                                $saw_w = false;
                                break;
                            }
                    }
                    if (!$saw_w)
                        break;
                }
            }
            if ($saw_w) {
                if (!$weak)
                    return true;
            } else
                $result = false;
        }
        return $result;
    }

    /**
     * Return true iff every non-empty space-separated word of `$iftext`
     * is present in the word list `$ws`.
     */
    private static function match_if($iftext, $ws) {
        foreach (explode(" ", $iftext) as $w)
            if ($w !== "" && !in_array($w, $ws, true))
                return false;
        return true;
    }


    /**
     * Heuristically decide whether `$s` looks like an affiliation rather
     * than a person's name, using the word-info table and capitalization.
     *
     * @param string $s
     * @param bool $default_name when true, a lone unclassified word is not
     *    treated as an affiliation by default
     * @return bool
     */
    static function is_likely_affiliation($s, $default_name = false) {
        preg_match_all('/[A-Za-z0-9&]+/', UnicodeHelper::deaccent($s), $m);
        $has_weak = $has_nameish = false;
        $wordinfo = self::wordinfo();
        $nw = count($m[0]);
        $fc = null;   // index of the first all-caps word longer than 2 chars
        $nc = 0;      // number of such all-caps words
        $ninit = 0;   // number of words whose `nameish` value is 1
        foreach ($m[0] as $i => $w) {
            $aw = get($wordinfo, strtolower($w));
            if ($aw) {
                if (isset($aw->nameish)) {
                    if ($aw->nameish === false)
                        return true;
                    else if ($aw->nameish === 1) {
                        ++$ninit;
                        continue;
                    } else if ($aw->nameish === true
                               || ($aw->nameish === 2 && $i > 0)) {
                        $has_nameish = true;
                        continue;
                    } else if ($aw->nameish === 0)
                        continue;
                }
                if (isset($aw->weak) && $aw->weak)
                    $has_weak = true;
                else
                    return true;
            } else if (strlen($w) > 2 && ctype_upper($w)) {
                if ($fc === null)
                    $fc = $i;
                ++$nc;
            }
        }
        return $has_weak
            || ($nw === 1 && !$has_nameish && !$default_name)
            || ($nw === 1 && ctype_upper($m[0][0]))
            || ($ninit > 0 && $nw === $ninit)
            || ($nc > 0
                && !$has_nameish
                && $fc !== 1
                && ($nc < $nw || preg_match('{[-,/]}', $s)));
    }


    /**
     * Normalize a free-form collaborator list into one
     * "Name (Affiliation)" entry per line.
     *
     * @param string $s raw collaborator text
     * @param int $type 1 additionally strips short "a." / "ab." style line
     *    prefixes and enables parenthesis-based line splitting
     *    (NOTE(review): the meaning of the type codes is defined by callers)
     * @return ?string cleaned text; "None" if only none-like lines were
     *    present; null if nothing remains
     */
    static function fix_collaborators($s, $type = 0) {
        $s = cleannl($s);

        // remove unicode versions
        $x = ["“" => "\"", "”" => "\"", "–" => "-", "—" => "-", "•" => ";",
              ".~" => ". ", "\\item" => "; "];
        $s = preg_replace_callback('/(?:“|”|–|—|•|\.\~|\\\\item)/', function ($m) use ($x) {
            return $x[$m[0]];
        }, $s);
        // remove numbers
        $s = preg_replace('{^(?:\(?[1-9][0-9]*[.)][ \t]*|[-\*;\s]*[ \t]+'
            . ($type === 1 ? '|[a-z][a-z]?\.[ \t]+(?=[A-Z])' : '') . ')}m', "", $s);

        // separate multi-person lines
        list($olines, $lines) = [explode("\n", $s), []];
        foreach ($olines as $line) {
            $line = trim($line);
            if (strlen($line) <= 35
                || !self::fix_collaborators_split_line($line, $lines, count($olines), $type))
                $lines[] = $line;
        }

        list($olines, $lines) = [$lines, []];
        $any = false;
        foreach ($olines as $line) {
            // remove quotes
            if (str_starts_with($line, "\""))
                $line = preg_replace_callback('{""?}', function ($m) {
                    return strlen($m[0]) === 1 ? "" : "\"";
                }, $line);
            // comments, trim punctuation
            if ($line !== "") {
                if ($line[0] === "#") {
                    $lines[] = $line;
                    continue;
                }
                $last_ch = $line[strlen($line) - 1];
                if ($last_ch === ":") {
                    $lines[] = "# " . $line;
                    continue;
                }
            }
            // expand tab separation
            if (strpos($line, "(") === false
                && strpos($line, "\t") !== false) {
                $ws = preg_split('/\t+/', $line);
                $nw = count($ws);
                if ($nw > 2 && strpos($ws[0], " ") === false) {
                    $name = rtrim($ws[0] . " " . $ws[1]);
                    $aff = rtrim($ws[2]);
                    $rest = rtrim(join(" ", array_slice($ws, 3)));
                } else {
                    $name = $ws[0];
                    $aff = rtrim($ws[1]);
                    $rest = rtrim(join(" ", array_slice($ws, 2)));
                }
                if ($rest !== "")
                    $rest = preg_replace('{\A[,\s]+}', "", $rest);
                if ($aff !== "" && $aff[0] !== "(")
                    $aff = "($aff)";
                $line = $name;
                if ($aff !== "")
                    $line .= ($line === "" ? "" : " ") . $aff;
                if ($rest !== "")
                    $line .= ($line === "" ? "" : " - ") . $rest;
            }
            // simplify whitespace
            $line = simplify_whitespace($line);
            // apply parentheses
            if (($paren = strpos($line, "(")) !== false)
                $line = self::fix_collaborators_line_parens($line, $paren);
            else
                $line = self::fix_collaborators_line_no_parens($line);
            // append line
            if (!preg_match('{\A(?:none|n/a|na|-*|\.*)[\s,;.]*\z}i', $line))
                $lines[] = $line;
            else if ($line !== "")
                $any = true;
            else if (!empty($lines))
                $lines[] = $line;
        }

        while (!empty($lines) && $lines[count($lines) - 1] === "")
            array_pop($lines);
        if (!empty($lines))
            return join("\n", $lines);
        else if ($any)
            return "None";
        else
            return null;
    }

    /**
     * Try to split a line holding several collaborators into separate
     * entries appended to `$lines`.
     *
     * @param string $line
     * @param array &$lines output list; entries are appended on success
     * @param int $ntext number of input lines (currently unused)
     * @param int $type see fix_collaborators()
     * @return bool false if the line was left for the caller to append
     */
    static private function fix_collaborators_split_line($line, &$lines, $ntext, $type) {
        // some people enter more than one collaborator per line
        $ncomma = substr_count($line, ",");
        $nparen = substr_count($line, "(");
        $nsemi = substr_count($line, ";");
        if ($ncomma <= 2 && ($type === 0 || $nparen <= 1) && $nsemi <= 1)
            return false;
        if ($ncomma === 0 && $nsemi === 0 && $type === 1) {
            // No separators at all: split after each parenthesized group
            $pairs = [];
            while (($pos = strpos($line, "(")) !== false) {
                $rpos = self::skip_balanced_parens($line, $pos);
                $rpos = min($rpos + 1, strlen($line));
                if ((string) substr($line, $rpos, 2) === " -")
                    $rpos = strlen($line);
                $pairs[] = trim(substr($line, 0, $rpos));
                $line = ltrim(substr($line, $rpos));
            }
            if ($line !== "")
                $pairs[] = $line;
            if (count($pairs) <= 2)
                return false;
            else {
                foreach ($pairs as $x)
                    $lines[] = $x;
                return true;
            }
        }
        $any = false;
        while ($line !== "") {
            if (str_starts_with($line, "\"")) {
                // Quoted entry: take it whole, then skip trailing separators
                preg_match('{\A"(?:[^"]|"")*(?:"|\z)([\s,;]*)}', $line, $m);
                $skip = strlen($m[1]);
                $pos = strlen($m[0]) - $skip;
                $any = false;
            } else {
                $pos = $skip = 0;
                $len = strlen($line);
                while ($pos < $len) {
                    if (!preg_match('{\G([^,(;]*)([,(;])}', $line, $mm, 0, $pos)) {
                        $pos = $len;
                        break;
                    }
                    $pos += strlen($mm[1]);
                    if ($mm[2] === "(") {
                        $rpos = self::skip_balanced_parens($line, $pos);
                        $rpos = min($rpos + 1, $len);
                        if ($rpos + 2 < $len && substr($line, $rpos, 2) === " -")
                            $pos = $len;
                        else
                            $pos = $rpos;
                    } else if ($mm[2] === ";" || !$nsemi || $ncomma > $nsemi + 1) {
                        $skip = 1;
                        break;
                    } else {
                        ++$pos;
                    }
                }
            }
            $w = substr($line, 0, $pos);
            // An affiliation-looking piece following a paren-less entry is
            // glued back onto that entry instead of becoming its own line.
            if ($nparen === 0 && $nsemi === 0 && $any
                && self::is_likely_affiliation($w))
                $lines[count($lines) - 1] .= ", " . $w;
            else {
                $lines[] = ltrim($w);
                $any = $any || strpos($w, "(") === false;
            }
            $line = (string) substr($line, $pos + $skip);
        }
        return true;
    }

    /**
     * Normalize a collaborator line that contains no "(".
     * @return string "", "None", "All (…)", "Name (Affiliation)", or the
     *    line itself (minus any ")") when no rule applies
     */
    static private function fix_collaborators_line_no_parens($line) {
        $line = str_replace(")", "", $line);
        if (preg_match('{\A(|none|n/a|na|)\s*[.,;\}]?\z}i', $line, $m))
            return $m[1] === "" ? "" : "None";
        if (preg_match('{\A(.*?)(\s*)([-,;:\}])\s+(.*)\z}', $line, $m)
            && ($m[2] !== "" || $m[3] !== "-")) {
            if (strcasecmp($m[1], "institution") === 0
                || strcasecmp($m[1], "all") === 0)
                return "All ($m[4])";
            $sp1 = strpos($m[1], " ");
            if (($m[3] !== "," || $sp1 !== false)
                && !self::is_likely_affiliation($m[1]))
                return "$m[1] ($m[4])";
            // "Last, First Affiliation…": keep the name, wrap the remainder
            if ($sp1 === false
                && $m[3] === ","
                && ($sp4 = strpos($m[4], " ")) !== false
                && self::is_likely_affiliation(substr($m[4], $sp4 + 1), true))
                return $m[1] . $m[2] . $m[3] . " " . substr($m[4], 0, $sp4)
                    . " (" . substr($m[4], $sp4 + 1) . ")";
        }
        if (self::is_likely_affiliation($line))
            return "All ($line)";
        else
            return $line;
    }

    /**
     * Normalize a collaborator line that contains "(".
     * @param string $line
     * @param int $paren byte offset of the first "(" in `$line`
     * @return string
     */
    static private function fix_collaborators_line_parens($line, $paren) {
        $name = rtrim((string) substr($line, 0, $paren));
        if (preg_match('{\A(?:|-|all|any|institution|none)\s*[.,:;\}]?\z}i', $name)) {
            $line = "All " . substr($line, $paren);
            $paren = 4;
        }
        // match parentheses
        $pos = $paren + 1;
        $depth = 1;
        $len = strlen($line);
        if (strpos($line, ")", $pos) === $len - 1) {
            $pos = $len;
            $depth = 0;
        } else {
            while ($pos < $len && $depth) {
                if ($line[$pos] === "(")
                    ++$depth;
                else if ($line[$pos] === ")")
                    --$depth;
                ++$pos;
            }
        }
        // close any parentheses left open
        while ($depth > 0) {
            $line .= ")";
            ++$pos;
            ++$len;
            --$depth;
        }
        // check for abbreviation, e.g., "Massachusetts Institute of Tech (MIT)"
        if ($pos === $len) {
            $aff = substr($line, $paren + 1, $pos - $paren - 2);
            if (ctype_upper($aff)
                && ($aum = AuthorMatcher::make_affiliation($aff))
                && $aum->test(substr($line, 0, $paren)))
                $line = "All (" . rtrim(substr($line, 0, $paren)) . ")";
            return $line;
        }
        // check for suffix
        if (preg_match('{\G[-,:;.#()\s"]*\z}', $line, $m, 0, $pos))
            return substr($line, 0, $pos);
        if (preg_match('{\G(\s*-+\s*|\s*[,:;.#%(\[\{]\s*|\s*(?=[a-z/\s]+\z))}', $line, $m, 0, $pos)) {
            $suffix = substr($line, $pos + strlen($m[1]));
            $line = substr($line, 0, $pos);
            if ($suffix !== "")
                $line .= " - " . $suffix;
            return $line;
        }
        if (strpos($line, "(", $pos) === false) {
            if (preg_match('{\G([^,;]+)[,;]\s*(\S.+)\z}', $line, $m, 0, $pos))
                $line = substr($line, 0, $pos) . $m[1] . " (" . $m[2] . ")";
            else
                $line .= " (unknown)";
        }
        return $line;
    }

    /**
     * Strip "#" comments (with preceding whitespace) from every line and
     * blank out a value that is exactly "None" (case-insensitive).
     */
    static function trim_collaborators($s) {
        return preg_replace('{\s*#.*$|\ANone\z}im', "", $s);
    }
}