1<?php
2// abbreviationmatcher.php -- HotCRP abbreviation matcher helper class
3// Copyright (c) 2006-2018 Eddie Kohler; see LICENSE.
4
5// Match priority (higher = more priority):
6// 5. Exact match
7// 4. Exact match with [-_.–—] replaced by spaces
8// 3. Case-insensitive match with [-_.–—] replaced by spaces
9// 2. Case-insensitive word match with [-_.–—] replaced by spaces
10// 1. Case-insensitive CamelCase match with [-_.–—] replaced by spaces
11// If a word match is performed, prefer matches that match more complete words.
12// Words must appear in order, so pattern "hello kitty" does not match "kitty
13// hello".
14// If the pattern has no Unicode characters, these steps are performed against
15// the deaccented subject (so subjects “élan” and “elan” match pattern “elan”
16// with the same priority). If the pattern has Unicode characters, then exact
17// matches take priority over deaccented matches (so subject “élan” is a higher
18// priority match for pattern “élan”).
19
20class AbbreviationMatchTracker {
21    private $isu;
22    private $pattern;
23    private $dpattern;
24    private $upattern;
25    private $dupattern;
26    private $imatchre;
27    private $is_camel_word;
28    private $has_star;
29    private $camelwords;
30    private $mclass = 1;
31    private $matches = [];
32
33    function __construct($pattern, $isu = null) {
34        if ($isu === null)
35            $isu = !!preg_match('/[\x80-\xFF]/', $pattern);
36        $this->isu = $isu;
37        if ($isu) {
38            $this->pattern = UnicodeHelper::normalize($pattern);
39            $this->dpattern = AbbreviationMatcher::dedash($this->pattern);
40            $this->upattern = UnicodeHelper::deaccent($this->pattern);
41            $this->dupattern = AbbreviationMatcher::dedash($this->upattern);
42        } else {
43            $this->pattern = $this->upattern = $pattern;
44            $this->dpattern = $this->dupattern = AbbreviationMatcher::dedash($pattern);
45        }
46        $this->is_camel_word = AbbreviationMatcher::is_camel_word($pattern);
47        $this->has_star = strpos($pattern, "*") !== false;
48    }
49    private function wmatch_score($pattern, $subject, $flags) {
50        // assert($pattern whitespace is simplified)
51        $pwords = explode(" ", $pattern);
52        $swords = preg_split('{\s+}', $subject);
53        $ppos = $spos = $demerits = 0;
54        $pword = null;
55        $pword_pos = -1;
56        $pword_star = false;
57        while (isset($pwords[$ppos]) && isset($swords[$spos])) {
58            if ($pword_pos !== $ppos) {
59                $pword = '{\A' . preg_quote($pwords[$ppos]) . '(\S*)\z}' . $flags;
60                $pword_pos = $ppos;
61                if ($this->has_star && strpos($pwords[$ppos], "*") !== false) {
62                    $pword = str_replace('\\*', '.*', $pword);
63                    $pword_star = true;
64                } else
65                    $pword_star = false;
66            }
67            if (preg_match($pword, $swords[$spos], $m)) {
68                ++$ppos;
69                $demerits += ($m[1] !== "" || $pword_star ? 1 : 0);
70            } else
71                $demerits += 1;
72            ++$spos;
73        }
74        // missed words cost 1/64 point, partial words cost 1/64 point
75        if (!isset($pwords[$ppos])) {
76            if (!$this->has_star)
77                $demerits += count($swords) - $spos;
78            return 1 - 0.015625 * max(min($demerits, 63), 1);
79        } else
80            return 0;
81    }
82    private function camel_wmatch_score($subject) {
83        assert($this->is_camel_word);
84        if (!$this->camelwords) {
85            $this->camelwords = [];
86            $x = $this->pattern;
87            while (preg_match('{\A[-_.]*([a-z]+|[A-Z][a-z]*|[0-9]+)(.*)\z}', $x, $m)) {
88                $this->camelwords[] = $m[1];
89                $this->camelwords[] = $m[2] !== "" && ctype_alnum(substr($m[2], 0, 1));
90                $x = $m[2];
91            }
92        }
93        $swords = preg_split('{\s+}', $subject);
94        $ppos = $spos = $demerits = 0;
95        while (isset($this->camelwords[$ppos]) && isset($swords[$spos])) {
96            $pword = $this->camelwords[$ppos];
97            $sword = $swords[$spos];
98            $ppos1 = $ppos;
99            $sidx = 0;
100            while ($sidx + strlen($pword) <= strlen($sword)
101                   && strcasecmp($pword, substr($sword, $sidx, strlen($pword))) === 0) {
102                $sidx += strlen($pword);
103                $ppos += 2;
104                if (!$this->camelwords[$ppos - 1])
105                    break;
106                $pword = $this->camelwords[$ppos];
107            }
108            $demerits += ($sidx < strlen($sword) ? 1 : 0);
109            ++$spos;
110        }
111        if (!isset($this->camelwords[$ppos])) {
112            if (!$this->has_star)
113                $demerits += count($swords) - $spos;
114            return 1 - 0.015625 * max(min($demerits, 63), 1);
115        } else
116            return 0;
117    }
118    private function mclass($subject, $sisu = null) {
119        if ($sisu === null)
120            $sisu = !!preg_match('/[\x80-\xFF]/', $subject);
121
122        if ($this->isu && $sisu) {
123            if ($this->pattern === $subject)
124                return 9;
125
126            if ($this->mclass < 9) {
127                $dsubject = AbbreviationMatcher::dedash($subject);
128                if ($this->dpattern === $dsubject)
129                    return 8;
130            }
131
132            if ($this->mclass < 7) {
133                if (!$this->imatchre)
134                    $this->imatchre = '{\A' . preg_quote($this->dpattern) . '\z}iu';
135                if (preg_match($this->imatchre, $dsubject))
136                    return 7;
137            }
138
139            if ($this->mclass < 7) {
140                $s = $this->wmatch_score($this->dpattern, $dsubject, "iu");
141                if ($s)
142                    return 6 + $s;
143            }
144        }
145
146        if ($this->mclass < 6) {
147            $usubject = $sisu ? UnicodeHelper::deaccent($subject) : $subject;
148            if ($this->upattern === $usubject)
149                return 5;
150        }
151
152        if ($this->mclass < 5) {
153            $dusubject = AbbreviationMatcher::dedash($usubject);
154            if ($this->dupattern === $dusubject)
155                return 4;
156        }
157
158        if ($this->mclass < 4) {
159            if (strcasecmp($this->dupattern, $dusubject) === 0)
160                return 3;
161        }
162
163        if ($this->mclass < 3) {
164            $s = $this->wmatch_score($this->dupattern, $dusubject, "i");
165            if ($s)
166                return 2 + $s;
167        }
168
169        if ($this->mclass < 2 && $this->is_camel_word) {
170            $s = $this->camel_wmatch_score($dusubject);
171            if ($s)
172                return 1 + $s;
173        }
174
175        return 0;
176    }
177
178    function check($subject, $data, $sisu = null) {
179        $mclass = $this->mclass($subject, $sisu);
180        if ($mclass > $this->mclass) {
181            $this->mclass = $mclass;
182            $this->matches = [$data];
183        } else if ($mclass == $this->mclass
184                   && $this->matches[count($this->matches) - 1] !== $data)
185            $this->matches[] = $data;
186    }
187
188    function matches() {
189        return $this->matches;
190    }
191}
192
193class AbbreviationClass {
194    const TYPE_CAMELCASE = 0;
195    const TYPE_LOWERDASH = 1;
196    public $type = self::TYPE_CAMELCASE;
197    public $drop_parens = true;
198    public $nwords = 3;
199    public $stopwords = "";
200    public $tflags = 0;
201    public $index = 0;
202
203    function step() {
204        ++$this->index;
205        if ($this->index >= 1)
206            $this->drop_parens = false;
207        if ($this->index >= 2)
208            $this->stopwords = false;
209        if ($this->index > $this->nwords)
210            $this->nwords = $this->index;
211        return $this->index <= 5;
212    }
213}
214
215class AbbreviationMatcher {
216    private $data = [];
217    private $nanal = 0;
218    private $matches = [];
219    private $abbreviators = [];
220
221    function add($name, $data, $tflags = 0, $prio = 0) {
222        $this->data[] = [$name, null, $data, $tflags, $prio];
223        $this->matches = [];
224    }
225
226    function set_abbreviator($tflags, Abbreviator $abbreviator) {
227        $this->abbreviators[$tflags] = $abbreviator;
228    }
229
230    static function dedash($text) {
231        return preg_replace('{(?:[-_.\s]|–|—)+}', " ", $text);
232    }
233    static function is_camel_word($text) {
234        return preg_match('{\A[-_.A-Za-z0-9]*(?:[A-Za-z](?=[-_.A-Z0-9])|[0-9](?=[-_.A-Za-z]))[-_.A-Za-z0-9]*\*?\z}', $text);
235    }
236
237    private function _analyze() {
238        while ($this->nanal < count($this->data)) {
239            $name = $uname = simplify_whitespace($this->data[$this->nanal][0]);
240            if (preg_match('/[\x80-\xFF]/', $name)) {
241                $name = UnicodeHelper::normalize($name);
242                $uname = UnicodeHelper::deaccent($name);
243            }
244            $this->data[$this->nanal][0] = $name;
245            $this->data[$this->nanal][1] = self::dedash($uname);
246            ++$this->nanal;
247        }
248    }
249
250    private function _find_all($pattern) {
251        if (empty($this->matches))
252            $this->_analyze();
253        // A call to Abbreviator::abbreviation_for() might call back in
254        // to AbbreviationMatcher::find_all(). Short-circuit that call.
255        $this->matches[$pattern] = [];
256
257        $spat = $upat = simplify_whitespace($pattern);
258        if (($sisu = !!preg_match('/[\x80-\xFF]/', $spat))) {
259            $spat = UnicodeHelper::normalize($spat);
260            $upat = UnicodeHelper::deaccent($spat);
261        }
262        $dupat = self::dedash($upat);
263        if (self::is_camel_word($upat)) {
264            $re = preg_replace('{([A-Za-z](?=[A-Z0-9 ])|[0-9](?=[A-Za-z ]))}', '$1(?:|.*\b)', $dupat);
265            $re = '{\b' . str_replace(" ", "", $re) . '}i';
266        } else {
267            $re = join('.*\b', preg_split('{[^A-Za-z0-9*]+}', $dupat));
268            $re = '{\b' . str_replace("*", ".*", $re) . '}i';
269        }
270
271        $mclass = 0;
272        $matches = [];
273        foreach ($this->data as $i => $d) {
274            if (strcasecmp($dupat, $d[1]) === 0) {
275                if ($mclass === 0)
276                    $matches = [];
277                $mclass = 1;
278                $matches[] = $i;
279            } else if ($mclass === 0 && preg_match($re, $d[1]))
280                $matches[] = $i;
281        }
282
283        if (count($matches) > 1) {
284            $amt = new AbbreviationMatchTracker($spat, $sisu);
285            foreach ($matches as $i) {
286                $d = $this->data[$i];
287                $amt->check($d[0], $i, strlen($d[0]) !== strlen($d[1]));
288            }
289            $matches = $amt->matches();
290        }
291
292        if (empty($matches)) {
293            $last_abbreviator = $last_value = null;
294            $amt = new AbbreviationMatchTracker($spat, $sisu);
295            foreach ($this->data as $i => $d) {
296                if ($d[2] instanceof Abbreviator)
297                    $abbreviator = $d[2];
298                else if (isset($this->abbreviators[$d[3]]))
299                    $abbreviator = $this->abbreviators[$d[3]];
300                else
301                    continue;
302                if ($last_abbreviator === $abbreviator && $last_value === $d[2])
303                    continue;
304                $last_abbreviator = $abbreviator;
305                $last_value = $d[2];
306                if (($abbrs = $abbreviator->abbreviations_for($d[0], $d[2])))
307                    foreach (is_string($abbrs) ? [$abbrs] : $abbrs as $abbr)
308                        $amt->check($abbr, $i);
309            }
310            $matches = $amt->matches();
311        }
312
313        $this->matches[$pattern] = $matches;
314    }
315
316    function find_all($pattern, $tflags = 0) {
317        if (!array_key_exists($pattern, $this->matches))
318            $this->_find_all($pattern);
319        $results = [];
320        $last = $prio = false;
321        foreach ($this->matches[$pattern] as $i) {
322            $d = $this->data[$i];
323            if (!$tflags || ($d[3] & $tflags) != 0) {
324                if ($prio === false || $d[4] > $prio) {
325                    $results = [];
326                    $prio = $d[4];
327                }
328                if (empty($results) || $d[2] !== $last)
329                    $results[] = $last = $d[2];
330            }
331        }
332        return $results;
333    }
334
335    function find1($pattern, $tflags = 0) {
336        $a = $this->find_all($pattern, $tflags);
337        if (empty($a))
338            return false;
339        else if (count($a) == 1)
340            return $a[0];
341        else
342            return null;
343    }
344
345
346    function unique_abbreviation($name, $data, AbbreviationClass $aclass) {
347        $last = $aclass_clone = null;
348        while (true) {
349            $x = self::make_abbreviation($name, $aclass);
350            if ($last !== $x) {
351                $last = $x;
352                $a = $this->find_all($x, $aclass->tflags);
353                if (count($a) === 1 && $a[0] === $data)
354                    return $x;
355            }
356            if (!$aclass_clone)
357                $aclass = $aclass_clone = clone $aclass;
358            if (!$aclass->step())
359                return null;
360        }
361    }
362
363    static function make_abbreviation($name, AbbreviationClass $aclass) {
364        $name = str_replace("'", "", $name);
365        // try to filter out noninteresting words
366        if ($aclass->stopwords !== false) {
367            $stopwords = (string) $aclass->stopwords;
368            if ($stopwords !== "")
369                $stopwords .= "|";
370            $xname = preg_replace('/\b(?:' . $stopwords . 'a|an|and|be|did|do|for|in|of|or|the|their|they|this|to|with|you)\b/i', '', $name);
371            $name = $xname ? : $name;
372        }
373        // drop parenthetical remarks
374        if ($aclass->drop_parens)
375            $name = preg_replace('/\(.*?\)|\[.*?\]/', ' ', $name);
376        // drop unlikely punctuation
377        $xname = preg_replace('/[-:\s+,.?!()\[\]\{\}_\/\"]+/', " ", " $name ");
378        // drop extraneous words
379        $xname = preg_replace('/\A(' . str_repeat(' \S+', $aclass->nwords) . ' ).*\z/', '$1', $xname);
380        if ($aclass->type === AbbreviationClass::TYPE_CAMELCASE) {
381            $xname = str_replace(" ", "", ucwords($xname));
382            return preg_replace('/([A-Z][a-z][a-z])[a-z]*/', '$1', $xname);
383        } else
384            return strtolower(str_replace(" ", "-", trim($xname)));
385    }
386}
387