1<?php
2// authormatcher.php -- HotCRP author matchers
3// Copyright (c) 2006-2018 Eddie Kohler; see LICENSE.
4
5class AuthorMatcher extends Author {
6    private $firstName_matcher;
7    private $lastName_matcher;
8    private $affiliation_matcher;
9    private $general_pregexes_;
10
11    private static $wordinfo;
12
13    function __construct($x = null) {
14        parent::__construct($x);
15    }
16
17    private function prepare() {
18        $any = [];
19        if ($this->firstName !== "") {
20            preg_match_all('/[a-z0-9]+/', $this->deaccent(0), $m);
21            $rr = [];
22            foreach ($m[0] as $w) {
23                $any[] = $rr[] = $w;
24                if (ctype_alpha($w[0])) {
25                    if (strlen($w) === 1)
26                        $any[] = $rr[] = $w . "[a-z]*";
27                    else
28                        $any[] = $rr[] = $w[0] . "(?=\\.)";
29                }
30            }
31            if (!empty($rr))
32                $this->firstName_matcher = (object) [
33                    "preg_raw" => '\b(?:' . join("|", $rr) . ')\b',
34                    "preg_utf8" => Text::UTF8_INITIAL_NONLETTERDIGIT . '(?:' . join("|", $rr) . ')' . Text::UTF8_FINAL_NONLETTERDIGIT
35                ];
36        }
37        if ($this->lastName !== "") {
38            preg_match_all('/[a-z0-9]+/', $this->deaccent(1), $m);
39            $rr = $ur = [];
40            foreach ($m[0] as $w) {
41                $any[] = $w;
42                $rr[] = '(?=.*\b' . $w . '\b)';
43                $ur[] = '(?=.*' . Text::UTF8_INITIAL_NONLETTERDIGIT . $w . Text::UTF8_FINAL_NONLETTERDIGIT . ')';
44            }
45            if (!empty($rr))
46                $this->lastName_matcher = (object) [
47                    "preg_raw" => '\A' . join("", $rr),
48                    "preg_utf8" => '\A' . join("", $ur)
49                ];
50        }
51        if ($this->affiliation !== "") {
52            $wordinfo = self::wordinfo();
53            preg_match_all('/[a-z0-9&]+/', $this->deaccent(2), $m);
54
55            $directs = $alts = [];
56            $any_weak = false;
57            foreach ($m[0] as $w) {
58                $aw = get($wordinfo, $w);
59                if ($aw && isset($aw->stop) && $aw->stop)
60                    continue;
61                $any[] = preg_quote($w);
62                $directs[] = $w;
63                if ($aw && isset($aw->weak) && $aw->weak)
64                    $any_weak = true;
65                if ($aw && isset($aw->alternate)) {
66                    if (is_array($aw->alternate))
67                        $alts = array_merge($alts, $aw->alternate);
68                    else
69                        $alts[] = $aw->alternate;
70                }
71                if ($aw && isset($aw->sync)) {
72                    if (is_array($aw->sync))
73                        $alts = array_merge($alts, $aw->sync);
74                    else
75                        $alts[] = $aw->sync;
76                }
77            }
78
79            $rs = $directs;
80            foreach ($alts as $alt) {
81                if (is_object($alt)) {
82                    if ((isset($alt->if) && !self::match_if($alt->if, $rs))
83                        || (isset($alt->if_not) && self::match_if($alt->if_not, $rs)))
84                        continue;
85                    $alt = $alt->word;
86                }
87                if (!is_string($alt))
88                    echo var_export($alt, true);
89                foreach (explode(" ", $alt) as $altw)
90                    if ($altw !== "") {
91                        $any[] = preg_quote($altw);
92                        $rs[] = $altw;
93                        $any_weak = true;
94                    }
95            }
96
97            if (!empty($rs)) {
98                $rex = '{\b(?:' . str_replace('&', '\\&', join("|", $rs)) . ')\b}';
99                $this->affiliation_matcher = [$directs, $any_weak, $rex];
100            }
101        }
102
103        $content = join("|", $any);
104        if ($content !== "" && $content !== "none") {
105            $this->general_pregexes_ = (object) [
106                "preg_raw" => '\b(?:' . $content . ')\b',
107                "preg_utf8" => Text::UTF8_INITIAL_NONLETTER . '(?:' . $content . ')' . Text::UTF8_FINAL_NONLETTER
108            ];
109        } else
110            $this->general_pregexes_ = false;
111    }
112
113    function general_pregexes() {
114        if ($this->general_pregexes_ === null)
115            $this->prepare();
116        return $this->general_pregexes_;
117    }
118
119    static function make($x, $nonauthor) {
120        if ($x !== "") {
121            $m = new AuthorMatcher($x);
122            if (!$m->is_empty()) {
123                $m->nonauthor = $nonauthor;
124                return $m;
125            }
126        }
127        return null;
128    }
129    static function make_string_guess($x) {
130        $m = new AuthorMatcher;
131        $m->assign_string_guess($x);
132        return $m;
133    }
134    static function make_affiliation($x) {
135        $m = new AuthorMatcher;
136        $m->affiliation = (string) $x;
137        return $m;
138    }
139    static function make_collaborator_line($x) {
140        if ($x === "" || strcasecmp($x, "none") === 0)
141            return null;
142        else {
143            $m = new AuthorMatcher;
144            $m->assign_string($x);
145            $m->nonauthor = true;
146            return $m;
147        }
148    }
149
150    const MATCH_NAME = 1;
151    const MATCH_AFFILIATION = 2;
152    function test($au, $prefer_name = false) {
153        if ($this->general_pregexes_ === null)
154            $this->prepare();
155        if (!$this->general_pregexes_)
156            return false;
157        if (is_string($au))
158            $au = Author::make_string_guess($au);
159        if ($this->lastName_matcher
160            && $au->lastName !== ""
161            && Text::match_pregexes($this->lastName_matcher, $au->lastName, $au->deaccent(1))
162            && ($au->firstName === ""
163                || !$this->firstName_matcher
164                || Text::match_pregexes($this->firstName_matcher, $au->firstName, $au->deaccent(0)))) {
165            return self::MATCH_NAME;
166        }
167        if ($this->affiliation_matcher
168            && $au->affiliation !== ""
169            && (!$prefer_name || $this->lastName === "" || $au->lastName === "")
170            && $this->test_affiliation($au->deaccent(2))) {
171            return self::MATCH_AFFILIATION;
172        }
173        return false;
174    }
175    static function highlight_all($au, $matchers) {
176        $aff_suffix = null;
177        if (is_object($au)) {
178            if ($au->affiliation)
179                $aff_suffix = "(" . htmlspecialchars($au->affiliation) . ")";
180            if ($au instanceof Contact)
181                $au = Text::name_text($au) . ($aff_suffix !== null ? " " . $aff_suffix : "");
182            else
183                $au = $au->nameaff_text();
184        }
185        $pregexes = [];
186        foreach ($matchers as $matcher)
187            $pregexes[] = $matcher->general_pregexes();
188        if (count($pregexes) > 1)
189            $pregexes = [Text::merge_pregexes($pregexes)];
190        if (!empty($pregexes))
191            $au = Text::highlight($au, $pregexes[0]);
192        if ($aff_suffix && str_ends_with($au, $aff_suffix))
193            $au = substr($au, 0, -strlen($aff_suffix))
194                . '<span class="auaff">' . $aff_suffix . '</span>';
195        return $au;
196    }
197    function highlight($au) {
198        return self::highlight_all($au, [$this]);
199    }
200
201    static function wordinfo() {
202        global $ConfSitePATH;
203        // XXX validate input JSON
204        if (self::$wordinfo === null)
205            self::$wordinfo = (array) json_decode(file_get_contents("$ConfSitePATH/etc/affiliationmatchers.json"));
206        return self::$wordinfo;
207    }
208    private function test_affiliation($mtext) {
209        list($am_words, $am_any_weak, $am_regex) = $this->affiliation_matcher;
210        if (!$am_any_weak)
211            return preg_match($am_regex, $mtext) === 1;
212        else if (!preg_match_all($am_regex, $mtext, $m))
213            return false;
214        $result = true;
215        $wordinfo = self::wordinfo();
216        foreach ($am_words as $w) { // $am_words contains no alternates
217            $aw = get($wordinfo, $w);
218            $weak = $aw && isset($aw->weak) && $aw->weak;
219            $saw_w = in_array($w, $m[0]);
220            if (!$saw_w && $aw && isset($aw->alternate)) {
221                // We didn't see a requested word; did we see one of its alternates?
222                foreach ($aw->alternate as $alt) {
223                    if (is_object($alt)) {
224                        if ((isset($alt->if) && !self::match_if($alt->if, $am_words))
225                            || (isset($alt->if_not) && self::match_if($alt->if_not, $am_words)))
226                            continue;
227                        $alt = $alt->word;
228                    }
229                    // Check for every word in the alternate list
230                    $saw_w = true;
231                    $altws = explode(" ", $alt);
232                    foreach ($altws as $altw)
233                        if ($altw !== "" && !in_array($altw, $m[0])) {
234                            $saw_w = false;
235                            break;
236                        }
237                    // If all are found, exit; check if the found alternate is strong
238                    if ($saw_w) {
239                        if ($weak && count($altws) == 1) {
240                            $aw2 = get($wordinfo, $alt);
241                            if (!$aw2 || !isset($aw2->weak) || !$aw2->weak)
242                                $weak = false;
243                        }
244                        break;
245                    }
246                }
247            }
248            // Check for sync words: e.g., "penn state university" ≠
249            // "university penn". For each sync word string, if *any* sync word
250            // is in matcher, then *some* sync word must be in subject;
251            // otherwise *no* sync word allowed in subject.
252            if ($saw_w && $aw && isset($aw->sync) && $aw->sync !== "") {
253                $synclist = is_array($aw->sync) ? $aw->sync : [$aw->sync];
254                foreach ($synclist as $syncws) {
255                    $syncws = explode(" ", $syncws);
256                    $has_any_syncs = false;
257                    foreach ($syncws as $syncw)
258                        if ($syncw !== "" && in_array($syncw, $am_words)) {
259                            $has_any_syncs = true;
260                            break;
261                        }
262                    if ($has_any_syncs) {
263                        $saw_w = false;
264                        foreach ($syncws as $syncw)
265                            if ($syncw !== "" && in_array($syncw, $m[0])) {
266                                $saw_w = true;
267                                break;
268                            }
269                    } else {
270                        $saw_w = true;
271                        foreach ($syncws as $syncw)
272                            if ($syncw !== "" && in_array($syncw, $m[0])) {
273                                $saw_w = false;
274                                break;
275                            }
276                    }
277                    if (!$saw_w)
278                        break;
279                }
280            }
281            if ($saw_w) {
282                if (!$weak)
283                    return true;
284            } else
285                $result = false;
286        }
287        return $result;
288    }
289    private static function match_if($iftext, $ws) {
290        foreach (explode(" ", $iftext) as $w)
291            if ($w !== "" && !in_array($w, $ws))
292                return false;
293        return true;
294    }
295
296
297    static function is_likely_affiliation($s, $default_name = false) {
298        preg_match_all('/[A-Za-z0-9&]+/', UnicodeHelper::deaccent($s), $m);
299        $has_weak = $has_nameish = false;
300        $wordinfo = self::wordinfo();
301        $nw = count($m[0]);
302        $fc = null;
303        $nc = 0;
304        $ninit = 0;
305        foreach ($m[0] as $i => $w) {
306            $aw = get($wordinfo, strtolower($w));
307            if ($aw) {
308                if (isset($aw->nameish)) {
309                    if ($aw->nameish === false)
310                        return true;
311                    else if ($aw->nameish === 1) {
312                        ++$ninit;
313                        continue;
314                    } else if ($aw->nameish === true
315                               || ($aw->nameish === 2 && $i > 0)) {
316                        $has_nameish = true;
317                        continue;
318                    } else if ($aw->nameish === 0)
319                        continue;
320                }
321                if (isset($aw->weak) && $aw->weak)
322                    $has_weak = true;
323                else
324                    return true;
325            } else if (strlen($w) > 2 && ctype_upper($w)) {
326                if ($fc === null)
327                    $fc = $i;
328                ++$nc;
329            }
330        }
331        return $has_weak
332            || ($nw === 1 && !$has_nameish && !$default_name)
333            || ($nw === 1 && ctype_upper($m[0][0]))
334            || ($ninit > 0 && $nw === $ninit)
335            || ($nc > 0
336                && !$has_nameish
337                && $fc !== 1
338                && ($nc < $nw || preg_match('{[-,/]}', $s)));
339    }
340
341
342    static function fix_collaborators($s, $type = 0) {
343        $s = cleannl($s);
344
345        // remove unicode versions
346        $x = ["“" => "\"", "”" => "\"", "–" => "-", "—" => "-", "•" => ";",
347              ".~" => ". ", "\\item" => "; "];
348        $s = preg_replace_callback('/(?:“|”|–|—|•|\.\~|\\\\item)/', function ($m) use ($x) {
349            return $x[$m[0]];
350        }, $s);
351        // remove numbers
352        $s = preg_replace('{^(?:\(?[1-9][0-9]*[.)][ \t]*|[-\*;\s]*[ \t]+'
353                . ($type === 1 ? '|[a-z][a-z]?\.[ \t]+(?=[A-Z])' : '') . ')}m', "", $s);
354
355        // separate multi-person lines
356        list($olines, $lines) = [explode("\n", $s), []];
357        foreach ($olines as $line) {
358            $line = trim($line);
359            if (strlen($line) <= 35
360                || !self::fix_collaborators_split_line($line, $lines, count($olines), $type))
361                $lines[] = $line;
362        }
363
364        list($olines, $lines) = [$lines, []];
365        $any = false;
366        foreach ($olines as $line) {
367            // remove quotes
368            if (str_starts_with($line, "\""))
369                $line = preg_replace_callback('{""?}', function ($m) {
370                    return strlen($m[0]) === 1 ? "" : "\"";
371                }, $line);
372            // comments, trim punctuation
373            if ($line !== "") {
374                if ($line[0] === "#") {
375                    $lines[] = $line;
376                    continue;
377                }
378                $last_ch = $line[strlen($line) - 1];
379                if ($last_ch === ":") {
380                    $lines[] = "# " . $line;
381                    continue;
382                }
383            }
384            // expand tab separation
385            if (strpos($line, "(") === false
386                && strpos($line, "\t") !== false) {
387                $ws = preg_split('/\t+/', $line);
388                $nw = count($ws);
389                if ($nw > 2 && strpos($ws[0], " ") === false) {
390                    $name = rtrim($ws[0] . " " . $ws[1]);
391                    $aff = rtrim($ws[2]);
392                    $rest = rtrim(join(" ", array_slice($ws, 3)));
393                } else {
394                    $name = $ws[0];
395                    $aff = rtrim($ws[1]);
396                    $rest = rtrim(join(" ", array_slice($ws, 2)));
397                }
398                if ($rest !== "")
399                    $rest = preg_replace('{\A[,\s]+}', "", $rest);
400                if ($aff !== "" && $aff[0] !== "(")
401                    $aff = "($aff)";
402                $line = $name;
403                if ($aff !== "")
404                    $line .= ($line === "" ? "" : " ") . $aff;
405                if ($rest !== "")
406                    $line .= ($line === "" ? "" : " - ") . $rest;
407            }
408            // simplify whitespace
409            $line = simplify_whitespace($line);
410            // apply parentheses
411            if (($paren = strpos($line, "(")) !== false)
412                $line = self::fix_collaborators_line_parens($line, $paren);
413            else
414                $line = self::fix_collaborators_line_no_parens($line);
415            // append line
416            if (!preg_match('{\A(?:none|n/a|na|-*|\.*)[\s,;.]*\z}i', $line))
417                $lines[] = $line;
418            else if ($line !== "")
419                $any = true;
420            else if (!empty($lines))
421                $lines[] = $line;
422        }
423
424        while (!empty($lines) && $lines[count($lines) - 1] === "")
425            array_pop($lines);
426        if (!empty($lines))
427            return join("\n", $lines);
428        else if ($any)
429            return "None";
430        else
431            return null;
432    }
433    static private function fix_collaborators_split_line($line, &$lines, $ntext, $type) {
434        // some assholes enter more than one per line
435        $ncomma = substr_count($line, ",");
436        $nparen = substr_count($line, "(");
437        $nsemi = substr_count($line, ";");
438        if ($ncomma <= 2 && ($type === 0 || $nparen <= 1) && $nsemi <= 1)
439            return false;
440        if ($ncomma === 0 && $nsemi === 0 && $type === 1) {
441            $pairs = [];
442            while (($pos = strpos($line, "(")) !== false) {
443                $rpos = self::skip_balanced_parens($line, $pos);
444                $rpos = min($rpos + 1, strlen($line));
445                if ((string) substr($line, $rpos, 2) === " -")
446                    $rpos = strlen($line);
447                $pairs[] = trim(substr($line, 0, $rpos));
448                $line = ltrim(substr($line, $rpos));
449            }
450            if ($line !== "")
451                $pairs[] = $line;
452            if (count($pairs) <= 2)
453                return false;
454            else {
455                foreach ($pairs as $x)
456                    $lines[] = $x;
457                return true;
458            }
459        }
460        $any = false;
461        while ($line !== "") {
462            if (str_starts_with($line, "\"")) {
463                preg_match('{\A"(?:[^"]|"")*(?:"|\z)([\s,;]*)}', $line, $m);
464                $skip = strlen($m[1]);
465                $pos = strlen($m[0]) - $skip;
466                $any = false;
467            } else {
468                $pos = $skip = 0;
469                $len = strlen($line);
470                while ($pos < $len) {
471                    $last = $pos;
472                    if (!preg_match('{\G([^,(;]*)([,(;])}', $line, $mm, 0, $pos)) {
473                        $pos = $len;
474                        break;
475                    }
476                    $pos += strlen($mm[1]);
477                    if ($mm[2] === "(") {
478                        $rpos = self::skip_balanced_parens($line, $pos);
479                        $rpos = min($rpos + 1, $len);
480                        if ($rpos + 2 < $len && substr($line, $rpos, 2) === " -")
481                            $pos = $len;
482                        else
483                            $pos = $rpos;
484                    } else if ($mm[2] === ";" || !$nsemi || $ncomma > $nsemi + 1) {
485                        $skip = 1;
486                        break;
487                    } else {
488                        ++$pos;
489                    }
490                }
491            }
492            $w = substr($line, 0, $pos);
493            if ($nparen === 0 && $nsemi === 0 && $any
494                && self::is_likely_affiliation($w))
495                $lines[count($lines) - 1] .= ", " . $w;
496            else {
497                $lines[] = ltrim($w);
498                $any = $any || strpos($w, "(") === false;
499            }
500            $line = (string) substr($line, $pos + $skip);
501        }
502        return true;
503    }
504    static private function fix_collaborators_line_no_parens($line) {
505        $line = str_replace(")", "", $line);
506        if (preg_match('{\A(|none|n/a|na|)\s*[.,;\}]?\z}i', $line, $m))
507            return $m[1] === "" ? "" : "None";
508        if (preg_match('{\A(.*?)(\s*)([-,;:\}])\s+(.*)\z}', $line, $m)
509            && ($m[2] !== "" || $m[3] !== "-")) {
510            if (strcasecmp($m[1], "institution") === 0
511                || strcasecmp($m[1], "all") === 0)
512                return "All ($m[4])";
513            $sp1 = strpos($m[1], " ");
514            if (($m[3] !== "," || $sp1 !== false)
515                && !self::is_likely_affiliation($m[1]))
516                return "$m[1] ($m[4])";
517            if ($sp1 === false
518                && $m[3] === ","
519                && ($sp4 = strpos($m[4], " ")) !== false
520                && self::is_likely_affiliation(substr($m[4], $sp4 + 1), true))
521                return $m[1] . $m[2] . $m[3] . " " . substr($m[4], 0, $sp4)
522                    . " (" . substr($m[4], $sp4 + 1) . ")";
523        }
524        if (self::is_likely_affiliation($line))
525            return "All ($line)";
526        else
527            return $line;
528    }
529    static private function fix_collaborators_line_parens($line, $paren) {
530        $name = rtrim((string) substr($line, 0, $paren));
531        if (preg_match('{\A(?:|-|all|any|institution|none)\s*[.,:;\}]?\z}i', $name)) {
532            $line = "All " . substr($line, $paren);
533            $paren = 4;
534        }
535        // match parentheses
536        $pos = $paren + 1;
537        $depth = 1;
538        $len = strlen($line);
539        if (strpos($line, ")", $pos) === $len - 1) {
540            $pos = $len;
541            $depth = 0;
542        } else {
543            while ($pos < $len && $depth) {
544                if ($line[$pos] === "(")
545                    ++$depth;
546                else if ($line[$pos] === ")")
547                    --$depth;
548                ++$pos;
549            }
550        }
551        while ($depth > 0) {
552            $line .= ")";
553            ++$pos;
554            ++$len;
555            --$depth;
556        }
557        // check for abbreviation, e.g., "Massachusetts Institute of Tech (MIT)"
558        if ($pos === $len) {
559            $aff = substr($line, $paren + 1, $pos - $paren - 2);
560            if (ctype_upper($aff)
561                && ($aum = AuthorMatcher::make_affiliation($aff))
562                && $aum->test(substr($line, 0, $paren)))
563                $line = "All (" . rtrim(substr($line, 0, $paren)) . ")";
564            return $line;
565        }
566        // check for suffix
567        if (preg_match('{\G[-,:;.#()\s"]*\z}', $line, $m, 0, $pos))
568            return substr($line, 0, $pos);
569        if (preg_match('{\G(\s*-+\s*|\s*[,:;.#%(\[\{]\s*|\s*(?=[a-z/\s]+\z))}', $line, $m, 0, $pos)) {
570            $suffix = substr($line, $pos + strlen($m[1]));
571            $line = substr($line, 0, $pos);
572            if ($suffix !== "")
573                $line .= " - " . $suffix;
574            return $line;
575        }
576        if (strpos($line, "(", $pos) === false) {
577            if (preg_match('{\G([^,;]+)[,;]\s*(\S.+)\z}', $line, $m, 0, $pos))
578                $line = substr($line, 0, $pos) . $m[1] . " (" . $m[2] . ")";
579            else
580                $line .= " (unknown)";
581        }
582        return $line;
583    }
584
585    static function trim_collaborators($s) {
586        return preg_replace('{\s*#.*$|\ANone\z}im', "", $s);
587    }
588}
589