<?php
// text.php -- HotCRP text helper functions
// Copyright (c) 2006-2018 Eddie Kohler; see LICENSE.

/**
 * Plain record describing a person's name, as filled in by
 * Text::analyze_name_args().
 *
 * After analysis, `firstName`, `lastName` and `email` are always strings;
 * `name` is "first last"; `orderedName` is either `name` or "last, first"
 * (when `lastFirst` is truthy); `unaccentedName` is `name` passed through
 * UnicodeHelper::deaccent() when it contains non-ASCII bytes.
 * `nameAutosplit` is set to true when first/last were derived by splitting
 * `name`. The remaining properties are copied from the caller's arguments.
 */
class NameInfo {
    public $firstName;
    public $lastName;
    public $affiliation;
    public $email;
    public $name;
    public $orderedName;
    public $unaccentedName;
    public $middleName;
    public $lastFirst;
    public $nameAmbiguous;
    public $nameAutosplit;

    /** Return a fresh NameInfo whose `lastFirst` flag is already set. */
    static function make_last_first() {
        $ni = new NameInfo;
        $ni->lastFirst = true;
        return $ni;
    }
}

/**
 * Static helpers for name parsing/formatting, search-regex construction,
 * match highlighting, and small text conversions.
 */
class Text {
    // Positional argument index => NameInfo property (see analyze_name_args).
    static private $argkeys = array("firstName", "lastName", "email",
                                    "middleName", "lastFirst", "nameAmbiguous", "name");
    // Accepted array keys / object properties => NameInfo property.
    static private $mapkeys = array("firstName" => "firstName",
                                    "first" => "firstName",
                                    "lastName" => "lastName",
                                    "last" => "lastName",
                                    "givenName" => "firstName",
                                    "given" => "firstName",
                                    "familyName" => "lastName",
                                    "family" => "lastName",
                                    "email" => "email",
                                    "middleName" => "middleName",
                                    "middle" => "middleName",
                                    "lastFirst" => "lastFirst",
                                    "nameAmbiguous" => "nameAmbiguous",
                                    "name" => "name",
                                    "fullName" => "name",
                                    "affiliation" => "affiliation");
    // NameInfo properties that must be booleans when copied from an object.
    static private $boolkeys = array("lastFirst" => true,
                                     "nameAmbiguous" => true);
    // Lower-case words reported as "boring" by is_boring_word().
    static private $boring_words = [
        "a" => true, "an" => true, "as" => true, "be" => true,
        "by" => true, "did" => true, "do" => true, "for" => true,
        "in" => true, "is" => true, "of" => true, "on" => true,
        "the" => true, "this" => true, "through" => true, "to" => true,
        "with" => true
    ];

    /**
     * Split a leading particle sequence ("von", "van", "da", "de", "di",
     * "du", "den", "der", "la", "le") off a last name.
     *
     * @param string $lastName
     * @return array|null `[particles, rest]` (particles right-trimmed), or
     *   null when $lastName does not start with a particle
     */
    static function analyze_von($lastName) {
        // see also split_name; NB intentionally case sensitive
        if (preg_match('@\A((?:(?:v[ao]n|d[aeiu]|de[nr]|l[ae])\s+)+)(.*)\z@s', $lastName, $m))
            return array(rtrim($m[1]), $m[2]);
        else
            return null;
    }

    /**
     * Build a NameInfo from a heterogeneous argument list.
     *
     * Each element of $args is handled by type:
     * - a single string argument is first split with split_name(…, true)
     *   into first, last and email;
     * - strings/booleans are assigned positionally through $argkeys, but
     *   only while (index + delta) < 4;
     * - list-like arrays (having index 0) supply up to three values for
     *   firstName, lastName, email;
     * - other arrays are read by key through $mapkeys, after which the
     *   positional delta becomes 3;
     * - objects are read by property through $mapkeys, accepting only
     *   booleans for the $boolkeys properties and strings for the rest.
     * A property that is already set is never overwritten.
     *
     * @param array $args
     * @param NameInfo|null $ret optional record to fill in; a new NameInfo
     *   is created when this is falsy
     * @return NameInfo
     */
    static function analyze_name_args($args, $ret = null) {
        $ret = $ret ? : new NameInfo;
        // collect arguments
        $delta = 0;
        if (count($args) == 1 && is_string($args[0]))
            $args = self::split_name($args[0], true);
        foreach ($args as $i => $v) {
            if (is_string($v) || is_bool($v)) {
                if ($i + $delta < 4) {
                    $k = self::$argkeys[$i + $delta];
                    if (!isset($ret->$k))
                        $ret->$k = $v;
                }
            } else if (is_array($v) && isset($v[0])) {
                for ($j = 0; $j < 3 && $j < count($v); ++$j) {
                    $k = self::$argkeys[$j];
                    if (!isset($ret->$k))
                        $ret->$k = $v[$j];
                }
            } else if (is_array($v)) {
                foreach ($v as $k => $x)
                    if (($mk = get(self::$mapkeys, $k))
                        && !isset($ret->$mk))
                        $ret->$mk = $x;
                $delta = 3;
            } else if (is_object($v)) {
                foreach (self::$mapkeys as $k => $mk)
                    if (!isset($ret->$mk)
                        && isset($v->$k)
                        && (isset(self::$boolkeys[$mk])
                            ? is_bool($v->$k)
                            : is_string($v->$k)))
                        $ret->$mk = $v->$k;
            }
        }
        // set defaults
        $ret->firstName = (string) $ret->firstName;
        $ret->lastName = (string) $ret->lastName;
        $ret->email = (string) $ret->email;
        // compute names
        if ($ret->name !== "" && $ret->firstName === "" && $ret->lastName === "") {
            // only a full name was supplied: derive first/last from it
            list($ret->firstName, $ret->lastName) = self::split_name($ret->name);
            $ret->nameAutosplit = true;
        } else if ((string) $ret->middleName !== "")
            $ret->firstName .= ($ret->firstName === "" ? "" : " ") . $ret->middleName;
        if ($ret->firstName === "" || $ret->lastName === "")
            $ret->name = $ret->firstName . $ret->lastName;
        else
            $ret->name = $ret->firstName . " " . $ret->lastName;
        $ret->unaccentedName = $ret->orderedName = $ret->name;
        // deaccent only when the name contains bytes outside ASCII
        if (preg_match('/[\x80-\xFF]/', $ret->name))
            $ret->unaccentedName = UnicodeHelper::deaccent($ret->name);
        if ($ret->lastFirst && $ret->firstName !== "" && $ret->lastName !== "")
            $ret->orderedName = $ret->lastName . ", " . $ret->firstName;
        return $ret;
    }

    /**
     * Variadic front end for analyze_name_args().
     * @return NameInfo
     */
    static function analyze_name(/* ... */) {
        return self::analyze_name_args(func_get_args());
    }

    /**
     * Plain-text "Name <email>"; falls back to whichever part is present.
     * Arguments are interpreted by analyze_name_args().
     * @return string
     */
    static function user_text(/* ... */) {
        // was contactText
        $r = self::analyze_name_args(func_get_args());
        if ($r->orderedName !== "" && $r->email !== "")
            return "$r->orderedName <$r->email>";
        else
            return $r->orderedName ? : $r->email;
    }

    /**
     * HTML "Name &lt;email&gt;", with the email rendered as a `mailto:`
     * link when it contains "@". Returns "[No name]" when neither a name
     * nor an email is available. Arguments are interpreted by
     * analyze_name_args().
     * @return string HTML
     */
    static function user_html(/* ... */) {
        // was contactHtml
        $r = self::analyze_name_args(func_get_args());
        $e = htmlspecialchars($r->email);
        // The surrounding angle brackets are literal text in HTML output, so
        // they must be written as entities rather than raw `<` / `>`.
        if ($e !== "" && strpos($e, "@") !== false)
            $e = "&lt;<a href=\"mailto:$e\" class=\"mailto\">$e</a>&gt;";
        else if ($e !== "")
            $e = "&lt;$e&gt;";
        if ($r->orderedName !== "")
            return htmlspecialchars($r->orderedName) . ($e ? " " . $e : "");
        else
            return $e ? : "[No name]";
    }

    /**
     * Like user_html(), but the email is never turned into a link.
     * @return string HTML
     */
    static function user_html_nolink(/* ... */) {
        $r = self::analyze_name_args(func_get_args());
        // angle brackets are emitted as entities because the result is HTML
        if (($e = $r->email) !== "")
            $e = "&lt;" . htmlspecialchars($e) . "&gt;";
        if ($r->orderedName !== "")
            return htmlspecialchars($r->orderedName) . ($e ? " " . $e : "");
        else
            return $e ? : "[No name]";
    }

    /**
     * Plain-text name. The email is appended as " <email>" only when the
     * `nameAmbiguous` flag is set; with no name, the email is returned.
     * Arguments are interpreted by analyze_name_args().
     * @return string
     */
    static function name_text(/* ... */) {
        // was contactNameText
        $r = self::analyze_name_args(func_get_args());
        if ($r->nameAmbiguous && $r->orderedName !== "" && $r->email !== "")
            return "$r->orderedName <$r->email>";
        else
            return $r->orderedName ? : $r->email;
    }

    /**
     * HTML-escaped result of name_text() for the same arguments.
     * @return string HTML
     */
    static function name_html(/* ... */) {
        // was contactNameHtml
        $x = call_user_func_array("Text::name_text", func_get_args());
        return htmlspecialchars($x);
    }

    /**
     * Format a mail recipient as `Name <email>`. The name is wrapped in
     * double quotes (with `"` and `\` backslash-escaped) when it contains
     * control characters or any of `()[]<>@,;:\".`; a missing email is
     * written as "none". Arguments are interpreted by analyze_name_args().
     * @return string
     */
    static function user_email_to(/* ... */) {
        // was contactEmailTo
        $r = self::analyze_name_args(func_get_args());
        if (($e = $r->email) === "")
            $e = "none";
        if (($n = $r->orderedName) !== "") {
            if (preg_match('/[\000-\037()[\]<>@,;:\\".]/', $n))
                $n = "\"" . addcslashes($n, '"\\') . "\"";
            return "$n <$e>";
        } else
            return $e;
    }

    /**
     * Return the initial of $s: its first character when that is an ASCII
     * letter, otherwise its first Unicode letter if $s starts with one.
     * A period is appended unless $s is exactly that letter or starts with
     * the letter followed by a space. Returns "" when no initial is found.
     *
     * @param string $s
     * @return string
     */
    static function initial($s) {
        $x = "";
        if ((string) $s !== "") {
            if (ctype_alpha($s[0]))
                $x = $s[0];
            else if (preg_match("/^(\\pL)/us", $s, $m))
                $x = $m[1];
            // Don't add a period if first name is a single letter
            if ($x != "" && $x != $s && !str_starts_with($s, "$x "))
                $x .= ".";
        }
        return $x;
    }

    /**
     * Abbreviated plain-text name: the first-name initial, a non-breaking
     * space, and the last name. Falls back to the first name, then the
     * email, then "???". Arguments are interpreted by analyze_name_args().
     * @return string
     */
    static function abbrevname_text(/* ... */) {
        $r = self::analyze_name_args(func_get_args());
        $u = "";
        if ($r->lastName !== "") {
            $t = $r->lastName;
            if ($r->firstName !== "" && ($u = self::initial($r->firstName)) !== "")
                $u .= "\xC2\xA0"; // non-breaking space (U+00A0 in UTF-8), spelled out so it survives editors
        } else if ($r->firstName !== "")
            $t = $r->firstName;
        else
            $t = $r->email ? $r->email : "???";
        return $u . $t;
    }

    /**
     * HTML-escaped result of abbrevname_text() for the same arguments.
     * @return string HTML
     */
    static function abbrevname_html(/* ... */) {
        // was abbreviateNameHtml
        $x = call_user_func_array("Text::abbrevname_text", func_get_args());
        return htmlspecialchars($x);
    }

    // Alternation (no delimiters, no anchors) matching a name suffix.
    const SUFFIX_REGEX = 'Jr\.?|Sr\.?|Esq\.?|Ph\.?D\.?|M\.?[SD]\.?|Junior|Senior|Esquire|I+|IV|V|VI*|IX|XI*|2n?d|3r?d|[4-9]th|1\dth';

    /**
     * Split a full name into first and last parts.
     *
     * Trailing suffixes (SUFFIX_REGEX) and a trailing parenthetical are
     * peeled off first. If the remainder contains a comma it is read as
     * "last, first"; otherwise it is split at the last space, and trailing
     * particles of the first part ("von", "de", ...) are moved to the front
     * of the last name.
     *
     * @param string $name
     * @param bool $with_email when true, also try to extract an email
     *   address ("Name <email>", "\"Name\" email", or a whitespace-separated
     *   token containing "@") and return it as element 2
     * @return array `[first, last]`, or `[first, last, email]` when
     *   $with_email is true
     */
    static function split_name($name, $with_email = false) {
        $name = simplify_whitespace($name);

        $ret = ["", ""];
        if ($with_email) {
            $email = "";
            if ($name === "")
                /* do nothing */;
            else if ($name[strlen($name) - 1] === ">"
                     && preg_match('{\A\"?(.*?)\"?\s*<([^<>]+)>\z}', $name, $m))
                list($name, $email) = [$m[1], $m[2]];
            else if ($name[0] === "\""
                     && preg_match('{\A\s*\"(.*)\"\s+(\S+)\z}', $name, $m))
                list($name, $email) = [$m[1], $m[2]];
            else if (strpos($name, "@") === false)
                /* skip */;
            else if (!preg_match('{\A(.*?)\s+(\S+)\z}', $name, $m))
                // contains "@" but no whitespace: the whole input is the email
                return ["", "", trim($name)];
            else if (strpos($m[2], "@") !== false)
                list($name, $email) = array($m[1], $m[2]);
            else
                list($name, $email) = array($m[2], $m[1]);
            $ret[2] = $email;
        }

        // parenthetical comment on name attaches to first or last whole
        $paren = "";
        if ($name !== "" && $name[strlen($name) - 1] === ")"
            && preg_match('{\A(.*?)(\s*\(.*?\))\z}', $name, $m)) {
            $name = $m[1];
            $paren = $m[2];
        }

        // $m[1] = name proper, $m[2] = trailing suffix run (may be empty)
        preg_match('{\A(.*?)((?:[, ]+(?:' . self::SUFFIX_REGEX . '))*)\z}i', $name, $m);
        if (($comma = strrpos($m[1], ",")) !== false) {
            $ret[0] = ltrim(substr($m[1], $comma + 1));
            $ret[1] = rtrim(substr($m[1], 0, $comma)) . $m[2];
            if ($paren !== "")
                $ret[$m[2] === "" ? 0 : 1] .= $paren;
        } else if (($space = strrpos($m[1], " ")) !== false) {
            $ret[0] = substr($m[1], 0, $space);
            $ret[1] = substr($m[1], $space + 1) . $m[2] . $paren;
            // see also split_von
            if (strpos($ret[0], " ") !== false
                && preg_match('{\A(\S.*?)((?: (?:v[ao]n|d[aeiu]|de[nr]|l[ae]))+)\z}i', $ret[0], $m))
                list($ret[0], $ret[1]) = [$m[1], ltrim($m[2]) . " " . $ret[1]];
        } else if ($m[1] !== ""
                   && $m[2] !== ""
                   && preg_match('{\A((?: Junior| Senior| Esquire)*)(.*)\z}i', $m[2], $mm)) {
            $ret[0] = $m[1];
            $ret[1] = ltrim($m[2]) . $paren;
        } else
            $ret[1] = $name . $paren;

        return $ret;
    }

    /**
     * Split a leading title ("Dr", "Mr", "Mrs", "Ms", "Prof", optionally
     * followed by a period; case-insensitive) off a first name.
     *
     * @param string $first
     * @return array `[rest, prefix]`; the prefix keeps the whitespace that
     *   followed it, and is "" (with rest = $first) when no title is found
     */
    static function split_first_prefix($first) {
        // Group 1 is the title plus following whitespace, group 2 the rest.
        // The rest must start with a non-space character, so a bare title
        // with nothing after it is not treated as a prefix.
        if (preg_match('%\A((?:dr\.?|mr\.?|mrs\.?|ms\.?|prof\.?)\s+)(\S.*)\z%is', $first, $m))
            return [$m[2], $m[1]];
        else
            return [$first, ""];
    }

    /**
     * Split a first-name string into a first part and a middle part.
     *
     * @param string $first
     * @return array `[first, middle]`; middle is "" when the patterns do
     *   not match
     */
    static function split_first_middle($first) {
        if (preg_match('%\A((?:\pL\.\s*)*\pL[^\s.]\S*)\s+(.*)\z%', $first, $m)
            || preg_match('%\A(\pL[^\s.]\S*)\s*(.*)\z%', $first, $m))
            return [$m[1], $m[2]];
        else
            return [$first, ""];
    }

    /**
     * Split a trailing suffix (SUFFIX_REGEX, case-insensitive, separated by
     * whitespace and/or commas) off a last name. A bare "jr", "sr" or "esq"
     * suffix gets a period appended.
     *
     * @param string $last
     * @return array `[last, suffix]`; suffix is "" when none is found
     */
    static function split_last_suffix($last) {
        if (preg_match('{\A(.*?)[\s,]+(' . self::SUFFIX_REGEX . ')\z}i', $last, $m)) {
            if (preg_match('{\A(?:jr|sr|esq)\z}i', $m[2]))
                $m[2] .= ".";
            return [$m[1], $m[2]];
        } else
            return [$last, ""];
    }

    /**
     * Return the `unaccentedName` computed by analyze_name_args() for the
     * given arguments.
     * @return string
     */
    static function unaccented_name(/* ... */) {
        $x = self::analyze_name_args(func_get_args());
        return $x->unaccentedName;
    }

    /**
     * Build a delimiter-less regex fragment matching $word: the word is
     * preg_quote()d, each space becomes `\s+`, and `\b` is added at either
     * end that is an ASCII alphanumeric character.
     *
     * @param string $word
     * @return string regex fragment, or "" for an empty word
     */
    static function word_regex($word) {
        if ($word === "")
            return "";
        list($aw, $zw) = array(ctype_alnum($word[0]),
                               ctype_alnum($word[strlen($word) - 1]));
        return ($aw ? '\b' : '')
            . str_replace(" ", '\s+', preg_quote($word))
            . ($zw ? '\b' : '');
    }

    // Boundary fragments used by utf8_word_regex() and highlight(). The
    // INITIAL forms match start of subject or one non-letter(/digit)
    // extended grapheme; the FINAL forms match end of subject or a position
    // followed by a non-letter(/digit), non-mark character.
    const UTF8_INITIAL_NONLETTERDIGIT = '(?:\A|(?!\pL|\pN)\X)';
    const UTF8_INITIAL_NONLETTER = '(?:\A|(?!\pL)\X)';
    const UTF8_FINAL_NONLETTERDIGIT = '(?:\z|(?!\pL|\pN)(?=\PM))';
    const UTF8_FINAL_NONLETTER = '(?:\z|(?!\pL)(?=\PM))';

    /**
     * UTF-8 counterpart of word_regex(): spaces become `(?:\s|\p{Zs})+`,
     * and the UTF8_*_NONLETTERDIGIT fragments are added at either end that
     * is a Unicode letter or number. Note that the initial fragment may
     * consume one character before the word.
     *
     * @param string $word
     * @return string regex fragment for use with the `u` flag, or "" for an
     *   empty word
     */
    static function utf8_word_regex($word) {
        if ($word === "")
            return "";
        list($aw, $zw) = array(preg_match('{\A(?:\pL|\pN)}u', $word),
                               preg_match('{(?:\pL|\pN)\z}u', $word));
        // Maybe `$word` is not valid UTF-8. Avoid warnings later.
        if (!$aw && !$zw && !is_valid_utf8($word))
            return self::utf8_word_regex(convert_to_utf8($word));
        return ($aw ? self::UTF8_INITIAL_NONLETTERDIGIT : '')
            . str_replace(" ", '(?:\s|\p{Zs})+', preg_quote($word))
            . ($zw ? self::UTF8_FINAL_NONLETTERDIGIT : '');
    }

    /**
     * Attach search regexes for a word to an object.
     *
     * `preg_utf8` is always set; `preg_raw` is set only when the
     * whitespace-normalized word contains no bytes in \x80-\xFF. Unless
     * $literal_star is true, each `*` in the word becomes `\S*`, while a
     * backslash-escaped `*` stays a literal star.
     *
     * @param string|object $word the word, or an object whose `value`
     *   property holds it (that object is updated and returned)
     * @param bool $literal_star treat `*` literally
     * @return object
     */
    static function star_text_pregexes($word, $literal_star = false) {
        if (is_object($word))
            $reg = $word;
        else
            $reg = (object) ["value" => $word];

        $word = preg_replace('/\s+/', " ", $reg->value);
        if (!preg_match("/[\x80-\xFF]/", $word))
            $reg->preg_raw = Text::word_regex($word);
        $reg->preg_utf8 = Text::utf8_word_regex($word);

        if (!$literal_star && strpos($word, "*") !== false) {
            // preg_raw is absent for non-ASCII words; !empty() avoids
            // reading an undefined property in that case
            if (!empty($reg->preg_raw))
                $reg->preg_raw = str_replace('\\\\\S*', '\*', str_replace('\*', '\S*', $reg->preg_raw));
            $reg->preg_utf8 = str_replace('\\\\\S*', '\*', str_replace('\*', '\S*', $reg->preg_utf8));
        }

        return $reg;
    }

    /**
     * Combine several regex objects into one by joining their fragments
     * with "|". Falsy entries are skipped. `preg_raw` is set on the result
     * only when every used entry had a `preg_raw`.
     *
     * @param array $regex list of objects as made by star_text_pregexes()
     * @return object|false false when $regex is empty
     */
    static function merge_pregexes($regex) {
        if (empty($regex))
            return false;
        $a = $b = [];
        foreach ($regex as $x)
            if ($x) {
                $a[] = $x->preg_utf8;
                if (isset($x->preg_raw))
                    $b[] = $x->preg_raw;
            }
        $x = (object) ["preg_utf8" => join("|", $a)];
        if (count($a) == count($b))
            $x->preg_raw = join("|", $b);
        return $x;
    }

    /**
     * Test whether a regex object matches some text, case-insensitively.
     *
     * Without `preg_raw`, `preg_utf8` is matched against $text. With
     * `preg_raw`, `preg_utf8` is matched against $deaccented_text when that
     * is truthy and differs from $text; otherwise `preg_raw` is matched
     * against $text.
     *
     * @param object|false|null $reg
     * @param string $text
     * @param string|false|null $deaccented_text
     * @return bool
     */
    static function match_pregexes($reg, $text, $deaccented_text) {
        if (!$reg)
            return false;
        else if (!isset($reg->preg_raw))
            return !!preg_match('{' . $reg->preg_utf8 . '}ui', $text);
        else if ($deaccented_text && $deaccented_text !== $text)
            return !!preg_match('{' . $reg->preg_utf8 . '}ui', $deaccented_text);
        else
            return !!preg_match('{' . $reg->preg_raw . '}i', $text);
    }


    /**
     * HTML-escape $text, wrapping each non-empty match of $match in
     * `<span class="match">…</span>`.
     *
     * @param string $text
     * @param object|string|false|null $match a regex object (as made by
     *   star_text_pregexes()/merge_pregexes()), a regex fragment, or a
     *   complete regex starting with "{"; null, false and "" mean no
     *   highlighting
     * @param int &$n set to the number of delimiter segments found (0 when
     *   nothing matched)
     * @return string HTML
     */
    static function highlight($text, $match, &$n = null) {
        $n = 0;
        if ($match === null || $match === false || $match === "" || $text == "")
            return htmlspecialchars($text);

        $mtext = $text;
        $offsetmap = null;
        $flags = "";
        if (is_object($match)) {
            if (!isset($match->preg_raw)) {
                $match = $match->preg_utf8;
                $flags = "u";
            } else if (preg_match('/[\x80-\xFF]/', $text)) {
                // non-ASCII text: match against a deaccented copy and map
                // the offsets back to $text afterwards
                list($mtext, $offsetmap) = UnicodeHelper::deaccent_offsets($mtext);
                $match = $match->preg_utf8;
                $flags = "u";
            } else
                $match = $match->preg_raw;
        }

        $s = $clean_initial_nonletter = false;
        if ($match !== null && $match !== "") {
            if (str_starts_with($match, self::UTF8_INITIAL_NONLETTERDIGIT))
                $clean_initial_nonletter = true;
            if ($match[0] !== "{")
                $match = "{(" . $match . ")}is" . $flags;
            // odd-indexed elements of $s are the matched delimiters
            $s = preg_split($match, $mtext, -1, PREG_SPLIT_DELIM_CAPTURE);
        }
        if (!$s || count($s) == 1)
            return htmlspecialchars($text);

        $n = (int) (count($s) / 2);
        if ($offsetmap)
            // re-cut the segments out of the original text
            for ($i = $b = $o = 0; $i < count($s); ++$i)
                if ($s[$i] !== "") {
                    $o += strlen($s[$i]);
                    $e = UnicodeHelper::deaccent_translate_offset($offsetmap, $o);
                    $s[$i] = substr($text, $b, $e - $b);
                    $b = $e;
                }
        if ($clean_initial_nonletter)
            // the initial-boundary fragment may have consumed one leading
            // non-letter/digit character; move it out of the highlight
            for ($i = 1; $i < count($s); $i += 2)
                if ($s[$i] !== ""
                    && preg_match('{\A((?!\pL|\pN)\X)(.*)\z}us', $s[$i], $m)) {
                    $s[$i - 1] .= $m[1];
                    $s[$i] = $m[2];
                }
        for ($i = 0; $i < count($s); ++$i)
            if (($i % 2) && $s[$i] !== "")
                $s[$i] = '<span class="match">' . htmlspecialchars($s[$i]) . "</span>";
            else
                $s[$i] = htmlspecialchars($s[$i]);
        return join("", $s);
    }

    // Flags for simple_search().
    const SEARCH_CASE_SENSITIVE = 1;
    const SEARCH_UNPRIVILEGE_EXACT = 2;
    const SEARCH_ONLY_EXACT = 4;
    const SEARCH_NO_SPECIAL = 8;

    /**
     * Find the haystacks that match the words of $needle.
     *
     * $needle is split into words of `[A-Za-z_0-9*]` characters, with `*`
     * acting as a wildcard. Up to three progressively looser patterns are
     * tried (whole string, anchored prefix, anywhere on word boundaries);
     * the first one that matches anything determines the result.
     * SEARCH_UNPRIVILEGE_EXACT skips the whole-string pass and
     * SEARCH_ONLY_EXACT stops after the first pass that is tried.
     *
     * @param string $needle
     * @param array $haystacks
     * @param int $flags bitwise OR of SEARCH_* constants
     * @return array matching entries of $haystacks, original keys preserved
     */
    static function simple_search($needle, $haystacks, $flags = 0) {
        $reflags = $flags & self::SEARCH_CASE_SENSITIVE ? "" : "i";
        $rewords = array();
        foreach (preg_split('/[^A-Za-z_0-9*]+/', $needle) as $word)
            if ($word !== "")
                $rewords[] = str_replace("*", ".*", $word);
        $matches = array();
        $i = $flags & self::SEARCH_UNPRIVILEGE_EXACT ? 1 : 0;
        $last = $flags & self::SEARCH_ONLY_EXACT ? $i : 2;
        for (; $i <= $last && !count($matches); ++$i) {
            if ($i == 0)
                $re = ',\A' . join('\b.*\b', $rewords) . '\z,' . $reflags;
            else if ($i == 1)
                $re = ',\A' . join('\b.*\b', $rewords) . '\b,' . $reflags;
            else
                $re = ',\b' . join('.*\b', $rewords) . ',' . $reflags;
            $matches = preg_grep($re, $haystacks);
        }
        return $matches;
    }

    /**
     * Report whether $word (case-insensitively) is in the boring-word list.
     * @param string $word
     * @return bool
     */
    static function is_boring_word($word) {
        return isset(self::$boring_words[strtolower($word)]);
    }

    /**
     * Join hard-wrapped lines into single-line paragraphs.
     *
     * A line break is replaced by a space when the preceding segment is
     * longer than 49 bytes, the break carries no list marker (the captured
     * delimiter is at most 2 bytes), and the following segment is non-empty
     * and does not start with a space or tab.
     *
     * @param string $text
     * @return string
     */
    static function single_line_paragraphs($text) {
        // even indexes: line contents; odd indexes: newline (+ list marker)
        $lines = preg_split('/((?:\r\n?|\n)(?:[-+*][ \t]|\d+\.)?)/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
        $n = count($lines);
        for ($i = 1; $i < $n; $i += 2) {
            if (strlen($lines[$i - 1]) > 49
                && strlen($lines[$i]) <= 2
                && $lines[$i + 1] !== ""
                && $lines[$i + 1][0] !== " "
                && $lines[$i + 1][0] !== "\t")
                $lines[$i] = " ";
        }
        return join("", $lines);
    }

    /**
     * Convert a small subset of HTML to text: `<p>` becomes blank-line
     * separated paragraphs, `<br>` a newline, `<li>` a "* " bullet,
     * `<b>`/`<strong>` become `**…**`, `<i>`/`<em>` become `*…*`, and all
     * remaining tags are removed. The result is trimmed and its HTML
     * entities are decoded.
     *
     * @param string $x
     * @return string
     */
    static function html_to_text($x) {
        if (strpos($x, "<") !== false) {
            $x = preg_replace('{\s*<\s*p\s*>\s*(.*?)\s*<\s*/\s*p\s*>}si', "\n\n\$1\n\n", $x);
            $x = preg_replace('{\s*<\s*br\s*/?\s*>\s*(?:<\s*/\s*br\s*>\s*)?}si', "\n", $x);
            $x = preg_replace('{\s*<\s*li\s*>}si', "\n* ", $x);
            $x = preg_replace('{<\s*(b|strong)\s*>\s*(.*?)\s*<\s*/\s*\1\s*>}si', '**$2**', $x);
            $x = preg_replace('{<\s*(i|em)\s*>\s*(.*?)\s*<\s*/\s*\1\s*>}si', '*$2*', $x);
            $x = preg_replace('{<(?:[^"\'>]|".*?"|\'.*?\')*>}s', "", $x);
            $x = preg_replace('{\n\n\n+}s', "\n\n", $x);
        }
        return html_entity_decode(trim($x), ENT_QUOTES, "UTF-8");
    }
}