1<?php
2// text.php -- HotCRP text helper functions
3// Copyright (c) 2006-2018 Eddie Kohler; see LICENSE.
4
/** Plain holder for the components of a person's name.
 *
 * Instances are populated by Text::analyze_name_args(); every property
 * starts out null until that function (or a caller) fills it in. */
class NameInfo {
    /** First (given) name; Text::analyze_name_args() casts it to string and
     * appends any middle name to it. */
    public $firstName;
    /** Last (family) name; cast to string by Text::analyze_name_args(). */
    public $lastName;
    /** Affiliation, when one was supplied through a keyed array or object. */
    public $affiliation;
    /** Email address; cast to string by Text::analyze_name_args(). */
    public $email;
    /** Full name: first and last name joined by a single space. */
    public $name;
    /** Display name: "Last, First" when $lastFirst is set and both parts
     * are nonempty, otherwise the same as $name. */
    public $orderedName;
    /** $name passed through UnicodeHelper::deaccent() when it contains
     * non-ASCII bytes, otherwise the same as $name. */
    public $unaccentedName;
    /** Middle name supplied by the caller, if any. */
    public $middleName;
    /** Whether $orderedName should be formatted last-name-first. */
    public $lastFirst;
    /** When set, Text::name_text() appends the email to the name. */
    public $nameAmbiguous;
    /** Set to true when first/last were derived by splitting a full name. */
    public $nameAutosplit;

    /** Create an empty NameInfo whose names will be ordered last-name-first.
     * @return NameInfo */
    public static function make_last_first() {
        $info = new self;
        $info->lastFirst = true;
        return $info;
    }
}
23
/** Static helpers for names, search regexes, highlighting, and simple
 * text/HTML conversions. */
class Text {
    // Property names assigned to positional arguments of analyze_name_args().
    // Positional strings/bools only reach the first four slots; positional
    // list arrays only reach the first three.
    static private $argkeys = array("firstName", "lastName", "email",
                                    "middleName", "lastFirst", "nameAmbiguous", "name");
    // Accepted key/property spellings => NameInfo property they populate.
    static private $mapkeys = array("firstName" => "firstName",
                                    "first" => "firstName",
                                    "lastName" => "lastName",
                                    "last" => "lastName",
                                    "givenName" => "firstName",
                                    "given" => "firstName",
                                    "familyName" => "lastName",
                                    "family" => "lastName",
                                    "email" => "email",
                                    "middleName" => "middleName",
                                    "middle" => "middleName",
                                    "lastFirst" => "lastFirst",
                                    "nameAmbiguous" => "nameAmbiguous",
                                    "name" => "name",
                                    "fullName" => "name",
                                    "affiliation" => "affiliation");
    // NameInfo properties that must be booleans when copied from an object;
    // all other mapped properties must be strings.
    static private $boolkeys = array("lastFirst" => true,
                                     "nameAmbiguous" => true);
    // Lowercase words reported as boring by is_boring_word().
    static private $boring_words = [
        "a" => true, "an" => true, "as" => true, "be" => true,
        "by" => true, "did" => true, "do" => true, "for" => true,
        "in" => true, "is" => true, "of" => true, "on" => true,
        "the" => true, "this" => true, "through" => true, "to" => true,
        "with" => true
    ];

    /** Split a leading lowercase particle ("von", "van", "de", "la", ...)
     * off a last name.
     * @param string $lastName
     * @return ?array [particles, rest of name], or null if `$lastName` does
     *    not start with a particle */
    static function analyze_von($lastName) {
        // see also split_name; NB intentionally case sensitive
        if (preg_match('@\A((?:(?:v[ao]n|d[aeiu]|de[nr]|l[ae])\s+)+)(.*)\z@s', $lastName, $m))
            return array(rtrim($m[1]), $m[2]);
        else
            return null;
    }

    /** Collect name components from a flexible argument list into a NameInfo.
     *
     * Each element of `$args` may be:
     * - a string or bool: assigned positionally (firstName, lastName, email,
     *   middleName);
     * - a list array (has index 0): its first three elements are firstName,
     *   lastName, email;
     * - a keyed array: keys are looked up in `$mapkeys`;
     * - an object: properties named in `$mapkeys` are copied when they have
     *   the expected type (bool for `$boolkeys`, string otherwise).
     * A single string argument is first split with split_name(). Components
     * already set on the NameInfo are never overwritten.
     *
     * @param array $args
     * @param ?NameInfo $ret NameInfo to fill in; a new one is created if null
     * @return NameInfo with firstName, lastName, email as strings and name,
     *    orderedName, unaccentedName computed */
    static function analyze_name_args($args, $ret = null) {
        $ret = $ret ? : new NameInfo;
        // collect arguments
        $delta = 0;
        if (count($args) == 1 && is_string($args[0]))
            $args = self::split_name($args[0], true);
        foreach ($args as $i => $v) {
            if (is_string($v) || is_bool($v)) {
                if ($i + $delta < 4) {
                    $k = self::$argkeys[$i + $delta];
                    if (!isset($ret->$k))
                        $ret->$k = $v;
                }
            } else if (is_array($v) && isset($v[0])) {
                for ($j = 0; $j < 3 && $j < count($v); ++$j) {
                    $k = self::$argkeys[$j];
                    if (!isset($ret->$k))
                        $ret->$k = $v[$j];
                }
            } else if (is_array($v)) {
                foreach ($v as $k => $x)
                    if (($mk = get(self::$mapkeys, $k))
                        && !isset($ret->$mk))
                        $ret->$mk = $x;
                $delta = 3;
            } else if (is_object($v)) {
                foreach (self::$mapkeys as $k => $mk)
                    if (!isset($ret->$mk)
                        && isset($v->$k)
                        && (isset(self::$boolkeys[$mk])
                            ? is_bool($v->$k)
                            : is_string($v->$k)))
                        $ret->$mk = $v->$k;
            }
        }
        // set defaults
        $ret->firstName = (string) $ret->firstName;
        $ret->lastName = (string) $ret->lastName;
        $ret->email = (string) $ret->email;
        // compute names
        // An absent (null) name must not be treated as a name to split:
        // otherwise nameAutosplit is set although nothing was split.
        if ($ret->name !== null && $ret->name !== ""
            && $ret->firstName === "" && $ret->lastName === "") {
            list($ret->firstName, $ret->lastName) = self::split_name($ret->name);
            $ret->nameAutosplit = true;
        } else if ((string) $ret->middleName !== "")
            $ret->firstName .= ($ret->firstName === "" ? "" : " ") . $ret->middleName;
        if ($ret->firstName === "" || $ret->lastName === "")
            $ret->name = $ret->firstName . $ret->lastName;
        else
            $ret->name = $ret->firstName . " " . $ret->lastName;
        $ret->unaccentedName = $ret->orderedName = $ret->name;
        if (preg_match('/[\x80-\xFF]/', $ret->name))
            $ret->unaccentedName = UnicodeHelper::deaccent($ret->name);
        if ($ret->lastFirst && $ret->firstName !== "" && $ret->lastName !== "")
            $ret->orderedName = $ret->lastName . ", " . $ret->firstName;
        return $ret;
    }

    /** Variadic front end for analyze_name_args().
     * @return NameInfo */
    static function analyze_name(/* ... */) {
        return self::analyze_name_args(func_get_args());
    }

    /** Plain-text "Name <email>"; falls back to whichever part is nonempty.
     * Arguments are as for analyze_name_args().
     * @return string */
    static function user_text(/* ... */) {
        // was contactText
        $r = self::analyze_name_args(func_get_args());
        if ($r->orderedName !== "" && $r->email !== "")
            return "$r->orderedName <$r->email>";
        else
            // explicit emptiness test: a name of "0" is still a name
            return $r->orderedName !== "" ? $r->orderedName : $r->email;
    }

    /** HTML "Name &lt;email&gt;"; an email containing "@" becomes a mailto
     * link. Returns "[No name]" when both name and email are empty.
     * Arguments are as for analyze_name_args().
     * @return string HTML */
    static function user_html(/* ... */) {
        // was contactHtml
        $r = self::analyze_name_args(func_get_args());
        $e = htmlspecialchars($r->email);
        if ($e !== "" && strpos($e, "@") !== false)
            $e = "&lt;<a href=\"mailto:$e\" class=\"mailto\">$e</a>&gt;";
        else if ($e !== "")
            $e = "&lt;$e&gt;";
        if ($r->orderedName !== "")
            return htmlspecialchars($r->orderedName) . ($e ? " " . $e : "");
        else
            return $e ? : "[No name]";
    }

    /** Like user_html(), but the email is never wrapped in a link.
     * @return string HTML */
    static function user_html_nolink(/* ... */) {
        $r = self::analyze_name_args(func_get_args());
        if (($e = $r->email) !== "")
            $e = "&lt;" . htmlspecialchars($e) . "&gt;";
        if ($r->orderedName !== "")
            return htmlspecialchars($r->orderedName) . ($e ? " " . $e : "");
        else
            return $e ? : "[No name]";
    }

    /** Plain-text name; the email is appended only when the name is marked
     * ambiguous, and used alone when there is no name.
     * Arguments are as for analyze_name_args().
     * @return string */
    static function name_text(/* ... */) {
        // was contactNameText
        $r = self::analyze_name_args(func_get_args());
        if ($r->nameAmbiguous && $r->orderedName !== "" && $r->email !== "")
            return "$r->orderedName <$r->email>";
        else
            // explicit emptiness test: a name of "0" is still a name
            return $r->orderedName !== "" ? $r->orderedName : $r->email;
    }

    /** HTML-escaped name_text().
     * @return string HTML */
    static function name_html(/* ... */) {
        // was contactNameHtml
        $x = call_user_func_array("Text::name_text", func_get_args());
        return htmlspecialchars($x);
    }

    /** Format a name and email as a mail header address ("Name <email>").
     * The name is double-quoted when it contains control characters or
     * RFC 5322 specials; an empty email is replaced by "none".
     * Arguments are as for analyze_name_args().
     * @return string */
    static function user_email_to(/* ... */) {
        // was contactEmailTo
        $r = self::analyze_name_args(func_get_args());
        if (($e = $r->email) === "")
            $e = "none";
        if (($n = $r->orderedName) !== "") {
            // The class lists the specials, including the backslash itself
            // (`\\\\` in this single-quoted string is the regex `\\`).
            if (preg_match('/[\000-\037()[\]<>@,;:\\\\".]/', $n))
                $n = "\"" . addcslashes($n, '"\\') . "\"";
            return "$n <$e>";
        } else
            return $e;
    }

    /** Return the initial of `$s`: its first letter, followed by a period
     * unless `$s` is just that letter or the letter is followed by a space.
     * Returns "" when `$s` is empty or does not start with a letter.
     * @param string $s
     * @return string */
    static function initial($s) {
        $x = "";
        if ((string) $s !== "") {
            if (ctype_alpha($s[0]))
                $x = $s[0];
            else if (preg_match("/^(\\pL)/us", $s, $m))
                $x = $m[1];
            // Don't add a period if first name is a single letter
            if ($x != "" && $x != $s && !str_starts_with($s, "$x "))
                $x .= ".";
        }
        return $x;
    }

    /** Abbreviated plain-text name: first initial plus last name, else the
     * first name, else the email, else "???".
     * Arguments are as for analyze_name_args().
     * @return string */
    static function abbrevname_text(/* ... */) {
        $r = self::analyze_name_args(func_get_args());
        $u = "";
        if ($r->lastName !== "") {
            $t = $r->lastName;
            if ($r->firstName !== "" && ($u = self::initial($r->firstName)) !== "")
                $u .= "\xC2\xA0"; // U+00A0 non-breaking space, spelled out so it is visible
        } else if ($r->firstName !== "")
            $t = $r->firstName;
        else
            $t = $r->email !== "" ? $r->email : "???";
        return $u . $t;
    }

    /** HTML-escaped abbrevname_text().
     * @return string HTML */
    static function abbrevname_html(/* ... */) {
        // was abbreviateNameHtml
        $x = call_user_func_array("Text::abbrevname_text", func_get_args());
        return htmlspecialchars($x);
    }

    // Regex alternation (no delimiters or anchors) of recognized name suffixes.
    const SUFFIX_REGEX = 'Jr\.?|Sr\.?|Esq\.?|Ph\.?D\.?|M\.?[SD]\.?|Junior|Senior|Esquire|I+|IV|V|VI*|IX|XI*|2n?d|3r?d|[4-9]th|1\dth';

    /** Split a full name into first and last parts.
     *
     * Handles "Last, First" and "First Last" forms; trailing suffixes
     * (SUFFIX_REGEX) and a trailing parenthetical are attached to one of the
     * parts, and particles such as "von"/"de" are moved to the last name.
     *
     * @param string $name
     * @param bool $with_email if true, also try to extract an email address
     *    ("Name <email>", "\"Name\" email", or a whitespace-separated token
     *    containing "@")
     * @return array [first, last], or [first, last, email] if `$with_email` */
    static function split_name($name, $with_email = false) {
        $name = simplify_whitespace($name);

        $ret = ["", ""];
        if ($with_email) {
            $email = "";
            if ($name === "")
                /* do nothing */;
            else if ($name[strlen($name) - 1] === ">"
                     && preg_match('{\A\"?(.*?)\"?\s*<([^<>]+)>\z}', $name, $m))
                list($name, $email) = [$m[1], $m[2]];
            else if ($name[0] === "\""
                     && preg_match('{\A\s*\"(.*)\"\s+(\S+)\z}', $name, $m))
                list($name, $email) = [$m[1], $m[2]];
            else if (strpos($name, "@") === false)
                /* skip */;
            else if (!preg_match('{\A(.*?)\s+(\S+)\z}', $name, $m))
                // a single token containing "@": the whole thing is the email
                return ["", "", trim($name)];
            else if (strpos($m[2], "@") !== false)
                list($name, $email) = array($m[1], $m[2]);
            else
                list($name, $email) = array($m[2], $m[1]);
            $ret[2] = $email;
        }

        // parenthetical comment on name attaches to first or last whole
        $paren = "";
        if ($name !== "" && $name[strlen($name) - 1] === ")"
            && preg_match('{\A(.*?)(\s*\(.*?\))\z}', $name, $m)) {
            $name = $m[1];
            $paren = $m[2];
        }

        // $m[1] is the name proper, $m[2] any trailing suffixes
        preg_match('{\A(.*?)((?:[, ]+(?:' . self::SUFFIX_REGEX . '))*)\z}i', $name, $m);
        if (($comma = strrpos($m[1], ",")) !== false) {
            $ret[0] = ltrim(substr($m[1], $comma + 1));
            $ret[1] = rtrim(substr($m[1], 0, $comma)) . $m[2];
            if ($paren !== "")
                $ret[$m[2] === "" ? 0 : 1] .= $paren;
        } else if (($space = strrpos($m[1], " ")) !== false) {
            $ret[0] = substr($m[1], 0, $space);
            $ret[1] = substr($m[1], $space + 1) . $m[2] . $paren;
            // see also split_von
            if (strpos($ret[0], " ") !== false
                && preg_match('{\A(\S.*?)((?: (?:v[ao]n|d[aeiu]|de[nr]|l[ae]))+)\z}i', $ret[0], $m))
                list($ret[0], $ret[1]) = [$m[1], ltrim($m[2]) . " " . $ret[1]];
        } else if ($m[1] !== ""
                   && $m[2] !== ""
                   // NOTE(review): $mm is never read, and this pattern appears
                   // to match any suffix string -- confirm intent
                   && preg_match('{\A((?: Junior| Senior| Esquire)*)(.*)\z}i', $m[2], $mm)) {
            $ret[0] = $m[1];
            $ret[1] = ltrim($m[2]) . $paren;
        } else
            $ret[1] = $name . $paren;

        return $ret;
    }

    /** Split a leading honorific (Dr, Mr, Mrs, Ms, Prof; optional period)
     * off a first name.
     * @param string $first
     * @return array [rest of name, prefix]; the prefix keeps its trailing
     *    whitespace and is "" when no honorific is present */
    static function split_first_prefix($first) {
        // The pattern has a single capture group (the prefix); the rest of
        // the name is whatever follows it.
        if (preg_match('%\A((?:dr\.?|mr\.?|mrs\.?|ms\.?|prof\.?)\s+)(?=\S)%i', $first, $m))
            return [(string) substr($first, strlen($m[1])), $m[1]];
        else
            return [$first, ""];
    }

    /** Split a first name into first and middle parts at whitespace.
     * @param string $first
     * @return array [first, middle]; middle is "" when neither pattern
     *    matches */
    static function split_first_middle($first) {
        if (preg_match('%\A((?:\pL\.\s*)*\pL[^\s.]\S*)\s+(.*)\z%', $first, $m)
            || preg_match('%\A(\pL[^\s.]\S*)\s*(.*)\z%', $first, $m))
            return [$m[1], $m[2]];
        else
            return [$first, ""];
    }

    /** Split a trailing suffix (SUFFIX_REGEX) off a last name.
     * A bare "jr", "sr", or "esq" suffix gets a period appended.
     * @param string $last
     * @return array [last name, suffix]; suffix is "" when none is present */
    static function split_last_suffix($last) {
        if (preg_match('{\A(.*?)[\s,]+(' . self::SUFFIX_REGEX . ')\z}i', $last, $m)) {
            if (preg_match('{\A(?:jr|sr|esq)\z}i', $m[2]))
                $m[2] .= ".";
            return [$m[1], $m[2]];
        } else
            return [$last, ""];
    }

    /** Return the unaccentedName computed by analyze_name_args().
     * @return string */
    static function unaccented_name(/* ... */) {
        $x = self::analyze_name_args(func_get_args());
        return $x->unaccentedName;
    }

    /** Build a byte-oriented regex fragment matching `$word` as a word.
     * `\b` is added at an end only if that end is alphanumeric; each space
     * in `$word` matches any run of whitespace.
     * @param string $word
     * @return string regex fragment without delimiters ("" for empty input) */
    static function word_regex($word) {
        if ($word === "")
            return "";
        list($aw, $zw) = array(ctype_alnum($word[0]),
                               ctype_alnum($word[strlen($word) - 1]));
        return ($aw ? '\b' : '')
            . str_replace(" ", '\s+', preg_quote($word))
            . ($zw ? '\b' : '');
    }

    // Word-boundary fragments for `u`-mode regexes. The INITIAL forms match
    // start of text or consume one non-letter(/digit) grapheme; the FINAL
    // forms are zero-width.
    const UTF8_INITIAL_NONLETTERDIGIT = '(?:\A|(?!\pL|\pN)\X)';
    const UTF8_INITIAL_NONLETTER = '(?:\A|(?!\pL)\X)';
    const UTF8_FINAL_NONLETTERDIGIT = '(?:\z|(?!\pL|\pN)(?=\PM))';
    const UTF8_FINAL_NONLETTER = '(?:\z|(?!\pL)(?=\PM))';

    /** UTF-8 counterpart of word_regex(), for use with the `u` modifier.
     * Boundaries are added at an end only if that end is a letter or digit.
     * Note that the initial boundary may consume one character of the
     * subject (see highlight()).
     * @param string $word
     * @return string regex fragment without delimiters ("" for empty input) */
    static function utf8_word_regex($word) {
        if ($word === "")
            return "";
        list($aw, $zw) = array(preg_match('{\A(?:\pL|\pN)}u', $word),
                               preg_match('{(?:\pL|\pN)\z}u', $word));
        // Maybe `$word` is not valid UTF-8. Avoid warnings later.
        if (!$aw && !$zw && !is_valid_utf8($word))
            return self::utf8_word_regex(convert_to_utf8($word));
        return ($aw ? self::UTF8_INITIAL_NONLETTERDIGIT : '')
            . str_replace(" ", '(?:\s|\p{Zs})+', preg_quote($word))
            . ($zw ? self::UTF8_FINAL_NONLETTERDIGIT : '');
    }

    /** Compute search regexes for a word, treating "*" as a wildcard.
     *
     * @param string|object $word the word, or an object whose `value`
     *    property holds the word (the object is updated and returned)
     * @param bool $literal_star if true, "*" matches only a literal star
     * @return object with `preg_utf8` always set and `preg_raw` set only
     *    when the word is pure ASCII */
    static function star_text_pregexes($word, $literal_star = false) {
        if (is_object($word))
            $reg = $word;
        else
            $reg = (object) ["value" => $word];

        $word = preg_replace('/\s+/', " ", $reg->value);
        if (!preg_match("/[\x80-\xFF]/", $word))
            $reg->preg_raw = Text::word_regex($word);
        $reg->preg_utf8 = Text::utf8_word_regex($word);

        if (!$literal_star && strpos($word, "*") !== false) {
            // Quoted `\*` becomes `\S*`; a star the user escaped with a
            // backslash is then turned back into a literal star.
            // preg_raw is absent for non-ASCII words, so test with empty().
            if (!empty($reg->preg_raw))
                $reg->preg_raw = str_replace('\\\\\S*', '\*', str_replace('\*', '\S*', $reg->preg_raw));
            $reg->preg_utf8 = str_replace('\\\\\S*', '\*', str_replace('\*', '\S*', $reg->preg_utf8));
        }

        return $reg;
    }

    /** Combine several regex objects into one alternation.
     * @param array $regex list of objects as returned by
     *    star_text_pregexes(); falsy entries are skipped
     * @return object|false false if `$regex` is empty; otherwise an object
     *    with `preg_utf8`, plus `preg_raw` only if every combined entry had
     *    one */
    static function merge_pregexes($regex) {
        if (empty($regex))
            return false;
        $a = $b = [];
        foreach ($regex as $x)
            if ($x) {
                $a[] = $x->preg_utf8;
                if (isset($x->preg_raw))
                    $b[] = $x->preg_raw;
            }
        $x = (object) ["preg_utf8" => join("|", $a)];
        if (count($a) == count($b))
            $x->preg_raw = join("|", $b);
        return $x;
    }

    /** Test whether a regex object matches some text, case-insensitively.
     * @param object|false $reg regex object (see star_text_pregexes())
     * @param string $text
     * @param string|false $deaccented_text deaccented form of `$text`; used
     *    instead of `$text` when `$reg` has a raw regex and the two differ
     * @return bool */
    static function match_pregexes($reg, $text, $deaccented_text) {
        if (!$reg)
            return false;
        else if (!isset($reg->preg_raw))
            return !!preg_match('{' . $reg->preg_utf8 . '}ui', $text);
        else if ($deaccented_text && $deaccented_text !== $text)
            return !!preg_match('{' . $reg->preg_utf8 . '}ui', $deaccented_text);
        else
            return !!preg_match('{' . $reg->preg_raw . '}i', $text);
    }


    /** HTML-escape `$text`, wrapping each match in
     * `<span class="match">`.
     *
     * @param string $text plain text
     * @param null|false|string|object $match nothing (null/false/""), a
     *    regex string (used verbatim if it starts with "{", otherwise
     *    wrapped as `{(...)}is`), or a regex object (see
     *    star_text_pregexes())
     * @param int &$n set to the number of matches found
     * @return string HTML */
    static function highlight($text, $match, &$n = null) {
        $n = 0;
        if ($match === null || $match === false || $match === "" || $text == "")
            return htmlspecialchars($text);

        $mtext = $text;
        $offsetmap = null;
        $flags = "";
        if (is_object($match)) {
            if (!isset($match->preg_raw)) {
                $match = $match->preg_utf8;
                $flags = "u";
            } else if (preg_match('/[\x80-\xFF]/', $text)) {
                // match against deaccented text, then map offsets back
                list($mtext, $offsetmap) = UnicodeHelper::deaccent_offsets($mtext);
                $match = $match->preg_utf8;
                $flags = "u";
            } else
                $match = $match->preg_raw;
        }

        $s = $clean_initial_nonletter = false;
        if ($match !== null && $match !== "") {
            if (str_starts_with($match, self::UTF8_INITIAL_NONLETTERDIGIT))
                $clean_initial_nonletter = true;
            if ($match[0] !== "{")
                $match = "{(" . $match . ")}is" . $flags;
            $s = preg_split($match, $mtext, -1, PREG_SPLIT_DELIM_CAPTURE);
        }
        if (!$s || count($s) == 1)
            return htmlspecialchars($text);

        // odd-indexed pieces of $s are the matches
        $n = (int) (count($s) / 2);
        if ($offsetmap)
            // replace deaccented pieces with the corresponding original text
            for ($i = $b = $o = 0; $i < count($s); ++$i)
                if ($s[$i] !== "") {
                    $o += strlen($s[$i]);
                    $e = UnicodeHelper::deaccent_translate_offset($offsetmap, $o);
                    $s[$i] = substr($text, $b, $e - $b);
                    $b = $e;
                }
        if ($clean_initial_nonletter)
            // the initial-boundary fragment consumed a leading nonletter;
            // move it out of the highlighted piece
            for ($i = 1; $i < count($s); $i += 2)
                if ($s[$i] !== ""
                    && preg_match('{\A((?!\pL|\pN)\X)(.*)\z}us', $s[$i], $m)) {
                    $s[$i - 1] .= $m[1];
                    $s[$i] = $m[2];
                }
        for ($i = 0; $i < count($s); ++$i)
            if (($i % 2) && $s[$i] !== "")
                $s[$i] = '<span class="match">' . htmlspecialchars($s[$i]) . "</span>";
            else
                $s[$i] = htmlspecialchars($s[$i]);
        return join("", $s);
    }

    // Flags for simple_search().
    const SEARCH_CASE_SENSITIVE = 1;
    const SEARCH_UNPRIVILEGE_EXACT = 2;
    const SEARCH_ONLY_EXACT = 4;
    const SEARCH_NO_SPECIAL = 8;

    /** Find the haystacks that match the words of `$needle`.
     *
     * `$needle` is split into words on anything other than letters, digits,
     * "_" and "*"; "*" is a wildcard. Up to three progressively looser
     * patterns are tried (whole-string, prefix, anywhere), stopping at the
     * first that matches anything. SEARCH_UNPRIVILEGE_EXACT skips the
     * whole-string pattern; SEARCH_ONLY_EXACT tries only the first pattern.
     *
     * @param string $needle
     * @param array $haystacks
     * @param int $flags bitwise OR of SEARCH_* constants
     * @return array matching elements of `$haystacks`, with keys preserved */
    static function simple_search($needle, $haystacks, $flags = 0) {
        $reflags = $flags & self::SEARCH_CASE_SENSITIVE ? "" : "i";
        $rewords = array();
        foreach (preg_split('/[^A-Za-z_0-9*]+/', $needle) as $word)
            if ($word !== "")
                $rewords[] = str_replace("*", ".*", $word);
        $matches = array();
        $i = $flags & self::SEARCH_UNPRIVILEGE_EXACT ? 1 : 0;
        $last = $flags & self::SEARCH_ONLY_EXACT ? $i : 2;
        for (; $i <= $last && !count($matches); ++$i) {
            if ($i == 0)
                $re = ',\A' . join('\b.*\b', $rewords) . '\z,' . $reflags;
            else if ($i == 1)
                $re = ',\A' . join('\b.*\b', $rewords) . '\b,' . $reflags;
            else
                $re = ',\b' . join('.*\b', $rewords) . ',' . $reflags;
            $matches = preg_grep($re, $haystacks);
        }
        return $matches;
    }

    /** Test whether `$word` is in the boring-word list (case-insensitive).
     * @param string $word
     * @return bool */
    static function is_boring_word($word) {
        return isset(self::$boring_words[strtolower($word)]);
    }

    /** Join hard-wrapped lines into single-line paragraphs.
     *
     * A bare line break (no list marker) is replaced by a space when the
     * preceding line is longer than 49 bytes and the following line is
     * nonempty and does not start with a space or tab.
     * @param string $text
     * @return string */
    static function single_line_paragraphs($text) {
        // even indexes: line contents; odd indexes: line break plus any
        // list marker ("- ", "+ ", "* ", "1.")
        $lines = preg_split('/((?:\r\n?|\n)(?:[-+*][ \t]|\d+\.)?)/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
        $n = count($lines);
        for ($i = 1; $i < $n; $i += 2) {
            if (strlen($lines[$i - 1]) > 49
                && strlen($lines[$i]) <= 2
                && $lines[$i + 1] !== ""
                && $lines[$i + 1][0] !== " "
                && $lines[$i + 1][0] !== "\t")
                $lines[$i] = " ";
        }
        return join("", $lines);
    }

    /** Convert simple HTML to plain text.
     *
     * Paragraphs and `<br>` become newlines, `<li>` becomes "* ", bold and
     * italic become `**...**` and `*...*`, remaining tags are dropped, and
     * entities are decoded.
     * @param string $x HTML
     * @return string */
    static function html_to_text($x) {
        if (strpos($x, "<") !== false) {
            $x = preg_replace('{\s*<\s*p\s*>\s*(.*?)\s*<\s*/\s*p\s*>}si', "\n\n\$1\n\n", $x);
            $x = preg_replace('{\s*<\s*br\s*/?\s*>\s*(?:<\s*/\s*br\s*>\s*)?}si', "\n", $x);
            $x = preg_replace('{\s*<\s*li\s*>}si', "\n* ", $x);
            $x = preg_replace('{<\s*(b|strong)\s*>\s*(.*?)\s*<\s*/\s*\1\s*>}si', '**$2**', $x);
            $x = preg_replace('{<\s*(i|em)\s*>\s*(.*?)\s*<\s*/\s*\1\s*>}si', '*$2*', $x);
            $x = preg_replace('{<(?:[^"\'>]|".*?"|\'.*?\')*>}s', "", $x);
            $x = preg_replace('{\n\n\n+}s', "\n\n", $x);
        }
        return html_entity_decode(trim($x), ENT_QUOTES, "UTF-8");
    }
}
493